Update the OCR app to work on document versions instead of documents; document versions are the model that holds the document page instances. Remove the old OCR document queue and replace it with a single model for OCR processing error entries. Increase compatibility with Django 1.7 and Python 3.
This commit is contained in:
@@ -1,61 +1,75 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.dispatch import receiver
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from south.signals import post_migrate
|
||||
|
||||
from acls.api import class_permissions
|
||||
from common.utils import encapsulate
|
||||
from documents.models import Document, DocumentVersion
|
||||
from documents.signals import post_version_upload
|
||||
from documents.widgets import document_link
|
||||
from main.api import register_maintenance_links
|
||||
from navigation.api import register_links
|
||||
from navigation.api import register_links, register_model_list_columns
|
||||
from navigation.links import link_spacer
|
||||
from project_tools.api import register_tool
|
||||
from rest_api.classes import APIEndPoint
|
||||
|
||||
from .links import (all_document_ocr_cleanup, ocr_tool_link,
|
||||
queue_document_list, queue_document_multiple_delete,
|
||||
re_queue_multiple_document, submit_document,
|
||||
submit_document_multiple)
|
||||
from .models import DocumentQueue
|
||||
from .links import (
|
||||
link_document_all_ocr_cleanup, link_document_submit,
|
||||
link_document_submit_multiple, link_entry_delete,
|
||||
link_entry_delete_multiple, link_entry_list, link_entry_re_queue,
|
||||
link_entry_re_queue_multiple
|
||||
)
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import PERMISSION_OCR_DOCUMENT
|
||||
from .tasks import task_do_ocr
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
register_links(Document, [submit_document])
|
||||
register_links([Document], [submit_document_multiple, link_spacer], menu_name='multi_item_links')
|
||||
register_links(['ocr:queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete])
|
||||
register_links(['ocr:queue_document_list'], [queue_document_list], menu_name='secondary_menu')
|
||||
register_links(Document, [link_document_submit])
|
||||
register_links([Document], [link_document_submit_multiple, link_spacer], menu_name='multi_item_links')
|
||||
|
||||
register_maintenance_links([all_document_ocr_cleanup], namespace='ocr', title=_(u'OCR'))
|
||||
register_links([DocumentVersionOCRError], [link_entry_re_queue_multiple, link_entry_delete_multiple, link_spacer], menu_name='multi_item_links')
|
||||
register_links([DocumentVersionOCRError], [link_entry_re_queue, link_entry_delete])
|
||||
register_links(['ocr:entry_list', 'ocr:entry_delete_multiple', 'ocr:entry_re_queue_multiple', DocumentVersionOCRError], [link_entry_list], menu_name='secondary_menu')
|
||||
register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR'))
|
||||
|
||||
|
||||
def document_ocr_submit(self):
    """
    Queue the most recent version of this document for OCR.

    The OCR task receives a document version primary key (it loads the
    object with ``DocumentVersion.objects.get``), so sending the document's
    own primary key would process an unrelated version or none at all.

    Nothing is queued when the document has no versions.
    """
    # ``versions`` is the reverse accessor of DocumentVersion.document
    # (related_name='versions'). Newest first; the primary key breaks ties
    # between equal timestamps. Slicing keeps this safe for an empty set.
    for version in self.versions.order_by('-timestamp', '-pk')[:1]:
        task_do_ocr.apply_async(args=[version.pk], queue='ocr')
|
||||
|
||||
|
||||
def document_version_ocr_submit(self):
    """
    Queue this document version for OCR.

    The OCR task loads its target with ``DocumentVersion.objects.get``,
    so the primary key sent must be the version's own and not the one of
    its parent document.
    """
    task_do_ocr.apply_async(args=[self.pk], queue='ocr')
|
||||
|
||||
|
||||
@receiver(post_version_upload, dispatch_uid='post_version_upload_ocr', sender=DocumentVersion)
|
||||
def post_version_upload_ocr(sender, instance, **kwargs):
|
||||
logger.debug('received post_version_upload')
|
||||
logger.debug('instance.document: %s', instance.document)
|
||||
logger.debug('instance pk: %s', instance.pk)
|
||||
if instance.document.document_type.ocr:
|
||||
instance.document.submit_for_ocr()
|
||||
|
||||
|
||||
@receiver(post_migrate, dispatch_uid='create_default_queue')
|
||||
def create_default_queue_signal_handler(sender, **kwargs):
|
||||
if kwargs['app'] == 'ocr':
|
||||
DocumentQueue.objects.get_or_create(name='default')
|
||||
instance.submit_for_ocr()
|
||||
|
||||
|
||||
Document.add_to_class('submit_for_ocr', document_ocr_submit)
|
||||
DocumentVersion.add_to_class('submit_for_ocr', document_version_ocr_submit)
|
||||
|
||||
class_permissions(Document, [PERMISSION_OCR_DOCUMENT])
|
||||
|
||||
register_tool(ocr_tool_link)
|
||||
register_tool(link_entry_list)
|
||||
|
||||
APIEndPoint('ocr')
|
||||
|
||||
register_model_list_columns(DocumentVersionOCRError, [
|
||||
{
|
||||
'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document))
|
||||
},
|
||||
{
|
||||
'name': _('Added'), 'attribute': 'datetime_submitted'
|
||||
},
|
||||
{
|
||||
'name': _('Result'), 'attribute': 'result'
|
||||
},
|
||||
])
|
||||
|
||||
@@ -1,20 +1,13 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.contrib import admin
|
||||
|
||||
from .models import DocumentQueue, QueueDocument
|
||||
from .models import DocumentVersionOCRError
|
||||
|
||||
|
||||
class QueueDocumentInline(admin.StackedInline):
|
||||
model = QueueDocument
|
||||
extra = 1
|
||||
classes = ('collapse-open',)
|
||||
allow_add = True
|
||||
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
    """
    Admin options for OCR error entries.
    """
    # Fields shown but not editable in the admin form
    readonly_fields = (
        'document_version',
        'datetime_submitted',
        'result',
    )
    # Columns of the admin change list
    list_display = (
        'document_version',
        'datetime_submitted',
    )
|
||||
|
||||
|
||||
class DocumentQueueAdmin(admin.ModelAdmin):
|
||||
inlines = [QueueDocumentInline]
|
||||
list_display = ('name', 'label')
|
||||
|
||||
|
||||
admin.site.register(DocumentQueue, DocumentQueueAdmin)
|
||||
admin.site.register(DocumentVersionOCRError, DocumentVersionOCRErrorAdmin)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
@@ -30,14 +30,14 @@ except sh.CommandNotFound:
|
||||
UNPAPER = None
|
||||
|
||||
|
||||
def do_document_ocr(document):
|
||||
def do_document_ocr(document_version):
|
||||
"""
|
||||
Try first to extract text from document pages using the registered
|
||||
parser, if the parser fails or if there is no parser registered for
|
||||
the document mimetype do a visual OCR by calling the corresponding
|
||||
OCR backend
|
||||
"""
|
||||
for document_page in document.pages.all():
|
||||
for document_page in document_version.pages.all():
|
||||
try:
|
||||
# Try to extract text by means of a parser
|
||||
parse_document_page(document_page)
|
||||
@@ -68,10 +68,10 @@ def do_document_ocr(document):
|
||||
|
||||
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
|
||||
try:
|
||||
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document.language)
|
||||
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)
|
||||
|
||||
document_page.content = ocr_cleanup(document.language, ocr_text)
|
||||
document_page.page_label = _(u'Text from OCR')
|
||||
document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
|
||||
document_page.page_label = _('Text from OCR')
|
||||
document_page.save()
|
||||
finally:
|
||||
fs_cleanup(pre_ocr_filepath_w_ext)
|
||||
@@ -86,7 +86,7 @@ def ocr_cleanup(language, text):
|
||||
cleanup filter
|
||||
"""
|
||||
try:
|
||||
language_backend = load_backend(u'.'.join([u'ocr', u'lang', language, u'LanguageBackend']))()
|
||||
language_backend = load_backend('.'.join(['ocr', 'lang', language, 'LanguageBackend']))()
|
||||
except ImportError:
|
||||
language_backend = None
|
||||
|
||||
@@ -104,9 +104,9 @@ def ocr_cleanup(language, text):
|
||||
result = word
|
||||
if result:
|
||||
output.append(result)
|
||||
output.append(u'\n')
|
||||
output.append('\n')
|
||||
|
||||
return u' '.join(output)
|
||||
return ' '.join(output)
|
||||
|
||||
|
||||
def clean_pages():
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.core.exceptions import PermissionDenied
|
||||
from django.shortcuts import get_object_or_404
|
||||
@@ -8,33 +8,33 @@ from rest_framework.response import Response
|
||||
from rest_framework.settings import api_settings
|
||||
|
||||
from acls.models import AccessEntry
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentVersion
|
||||
from permissions.models import Permission
|
||||
from rest_api.permissions import MayanPermission
|
||||
|
||||
from .permissions import PERMISSION_OCR_DOCUMENT
|
||||
from .serializers import DocumentOCRSerializer
|
||||
from .serializers import DocumentVersionOCRSerializer
|
||||
|
||||
|
||||
class DocumentOCRView(generics.GenericAPIView):
|
||||
serializer_class = DocumentOCRSerializer
|
||||
class DocumentVersionOCRView(generics.GenericAPIView):
|
||||
serializer_class = DocumentVersionOCRSerializer
|
||||
|
||||
permission_classes = (MayanPermission,)
|
||||
|
||||
def post(self, request, *args, **kwargs):
|
||||
"""Submit document OCR queue."""
|
||||
"""Submit document version for OCR."""
|
||||
|
||||
serializer = self.get_serializer(data=request.DATA, files=request.FILES)
|
||||
|
||||
if serializer.is_valid():
|
||||
document = get_object_or_404(Document, pk=serializer.data['document_id'])
|
||||
document_version = get_object_or_404(DocumentVersion, pk=serializer.data['document_version_id'])
|
||||
|
||||
try:
|
||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
|
||||
except PermissionDenied:
|
||||
AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document)
|
||||
AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document_version.document)
|
||||
|
||||
document.submit_for_ocr()
|
||||
document_version.submit_for_ocr()
|
||||
|
||||
headers = self.get_success_headers(serializer.data)
|
||||
return Response(serializer.data, status=status.HTTP_202_ACCEPTED,
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
class BackendBase(object):
    """
    Base class for OCR backends.
    """

    def execute(self, input_filename, language=None):
        """
        Run OCR on the file at ``input_filename``.

        ``language`` is optional and defaults to None. Subclasses must
        override this method; the base implementation always raises
        NotImplementedError.
        """
        raise NotImplementedError
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import codecs
|
||||
import errno
|
||||
@@ -20,11 +20,11 @@ class Tesseract(BackendBase):
|
||||
"""
|
||||
fd, filepath = tempfile.mkstemp()
|
||||
os.close(fd)
|
||||
ocr_output = os.extsep.join([filepath, u'txt'])
|
||||
ocr_output = os.extsep.join([filepath, 'txt'])
|
||||
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
|
||||
|
||||
if language is not None:
|
||||
command.extend([u'-l', language])
|
||||
command.extend(['-l', language])
|
||||
|
||||
try:
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
"""
|
||||
Raised by the OCR backend
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import re
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
@@ -6,14 +6,13 @@ from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES,
|
||||
PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE)
|
||||
|
||||
submit_document = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
submit_document_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document_multiple', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
re_queue_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
re_queue_multiple_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_multiple_document', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
queue_document_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
|
||||
queue_document_multiple_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
|
||||
link_document_submit = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
link_document_submit_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit_multiple', 'famfam': 'hourglass_add'}
|
||||
link_entry_re_queue = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
link_entry_re_queue_multiple = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue_multiple', 'famfam': 'hourglass_add'}
|
||||
link_entry_delete = {'text': _('Delete'), 'view': 'ocr:entry_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
|
||||
link_entry_delete_multiple = {'text': _('Delete'), 'view': 'ocr:entry_delete_multiple', 'famfam': 'hourglass_delete'}
|
||||
|
||||
all_document_ocr_cleanup = {'text': _(u'Clean up pages content'), 'view': 'ocr:all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')}
|
||||
link_document_all_ocr_cleanup = {'text': _('Clean up pages content'), 'view': 'ocr:document_all_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _('Runs a language filter to remove common OCR mistakes from document pages content.')}
|
||||
|
||||
queue_document_list = {'text': _(u'Queue document list'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
ocr_tool_link = {'text': _(u'OCR'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
link_entry_list = {'text': _('OCR Errors'), 'view': 'ocr:entry_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]}
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
DEFAULT_OCR_FILE_FORMAT = u'tiff'
|
||||
DEFAULT_OCR_FILE_EXTENSION = u'tif'
|
||||
from __future__ import unicode_literals
|
||||
|
||||
DEFAULT_OCR_FILE_FORMAT = 'tiff'
|
||||
DEFAULT_OCR_FILE_EXTENSION = 'tif'
|
||||
LOCK_EXPIRE = 60 * 10 # Adjust to worst case scenario
|
||||
UNPAPER_FILE_FORMAT = u'ppm'
|
||||
UNPAPER_FILE_FORMAT = 'ppm'
|
||||
|
||||
@@ -1,39 +1,22 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import models
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
from django.utils.translation import ugettext
|
||||
from django.utils.encoding import python_2_unicode_compatible
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentVersion
|
||||
|
||||
|
||||
class DocumentQueue(models.Model):
|
||||
name = models.CharField(max_length=64, unique=True, verbose_name=_(u'Name'))
|
||||
label = models.CharField(max_length=64, verbose_name=_(u'Label'))
|
||||
@python_2_unicode_compatible
|
||||
class DocumentVersionOCRError(models.Model):
|
||||
document_version = models.ForeignKey(DocumentVersion, verbose_name=_('Document version'))
|
||||
datetime_submitted = models.DateTimeField(verbose_name=_('Date time submitted'), auto_now=True, db_index=True)
|
||||
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
||||
|
||||
class Meta:
|
||||
verbose_name = _(u'Document queue')
|
||||
verbose_name_plural = _(u'Document queues')
|
||||
|
||||
def __unicode__(self):
|
||||
return self.label
|
||||
|
||||
|
||||
class QueueDocument(models.Model):
|
||||
document_queue = models.ForeignKey(DocumentQueue, related_name='documents', verbose_name=_(u'Document queue'))
|
||||
document = models.ForeignKey(Document, verbose_name=_(u'Document'))
|
||||
datetime_submitted = models.DateTimeField(verbose_name=_(u'Date time submitted'), auto_now=True, db_index=True)
|
||||
result = models.TextField(blank=True, null=True, verbose_name=_(u'Result'))
|
||||
node_name = models.CharField(max_length=256, verbose_name=_(u'Node name'), blank=True, null=True)
|
||||
def __str__(self):
    """
    Return the text representation of the related document version.
    """
    # ``unicode`` does not exist on Python 3. Formatting into a text
    # literal (this module enables unicode_literals) produces text on
    # both Python 2 and Python 3.
    return '%s' % self.document_version
|
||||
|
||||
class Meta:
|
||||
ordering = ('datetime_submitted',)
|
||||
verbose_name = _(u'Queue document')
|
||||
verbose_name_plural = _(u'Queue documents')
|
||||
|
||||
def __unicode__(self):
|
||||
try:
|
||||
return unicode(self.document)
|
||||
except ObjectDoesNotExist:
|
||||
return ugettext(u'Missing document.')
|
||||
verbose_name = _('Document Version OCR Error')
|
||||
verbose_name_plural = _('Document Version OCR Errors')
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
import slate
|
||||
@@ -90,7 +92,7 @@ class SlateParser(Parser):
|
||||
raise ParserError
|
||||
|
||||
document_page.content = pdf_pages[document_page.page_number - 1]
|
||||
document_page.page_label = _(u'Text extracted from PDF')
|
||||
document_page.page_label = _('Text extracted from PDF')
|
||||
document_page.save()
|
||||
|
||||
|
||||
@@ -112,7 +114,7 @@ class OfficeParser(Parser):
|
||||
|
||||
# Now that the office document has been converted to PDF
|
||||
# call the corresponding PDF parser in this new file
|
||||
parse_document_page(document_page, descriptor=open(input_filepath), mimetype=u'application/pdf')
|
||||
parse_document_page(document_page, descriptor=open(input_filepath), mimetype='application/pdf')
|
||||
else:
|
||||
raise ParserError
|
||||
|
||||
@@ -126,7 +128,7 @@ class PopplerParser(Parser):
|
||||
PDF parser using the pdftotext execute from the poppler package
|
||||
"""
|
||||
def __init__(self):
|
||||
self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else u'/usr/bin/pdftotext'
|
||||
self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else '/usr/bin/pdftotext'
|
||||
if not os.path.exists(self.pdftotext_path):
|
||||
raise ParserError('cannot find pdftotext executable')
|
||||
logger.debug('self.pdftotext_path: %s', self.pdftotext_path)
|
||||
@@ -167,9 +169,9 @@ class PopplerParser(Parser):
|
||||
raise ParserError('No output')
|
||||
|
||||
document_page.content = output
|
||||
document_page.page_label = _(u'Text extracted from PDF')
|
||||
document_page.page_label = _('Text extracted from PDF')
|
||||
document_page.save()
|
||||
|
||||
|
||||
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
|
||||
register_parser(mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser])
|
||||
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from permissions.models import Permission, PermissionNamespace
|
||||
|
||||
ocr_namespace = PermissionNamespace('ocr', _(u'OCR'))
|
||||
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _(u'Submit documents for OCR'))
|
||||
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _(u'Delete documents from OCR queue'))
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _(u'Can execute the OCR clean up on all document pages'))
|
||||
ocr_namespace = PermissionNamespace('ocr', _('OCR'))
|
||||
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR'))
|
||||
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue'))
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages'))
|
||||
|
||||
@@ -1,5 +1,3 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from common.utils import load_backend
|
||||
|
||||
from .settings import BACKEND
|
||||
|
||||
@@ -1,7 +1,5 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from rest_framework import serializers
|
||||
|
||||
|
||||
class DocumentOCRSerializer(serializers.Serializer):
|
||||
document_id = serializers.IntegerField()
|
||||
class DocumentVersionOCRSerializer(serializers.Serializer):
    """
    Payload accepted when submitting a document version for OCR.
    """
    # Integer identifying the document version; the API view reads it as
    # serializer.data['document_version_id']
    document_version_id = serializers.IntegerField()
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
"""Configuration options for the ocr app"""
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from smart_settings.api import register_settings
|
||||
|
||||
register_settings(
|
||||
namespace=u'ocr',
|
||||
module=u'ocr.settings',
|
||||
namespace='ocr',
|
||||
module='ocr.settings',
|
||||
settings=[
|
||||
{'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True},
|
||||
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
|
||||
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
|
||||
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
|
||||
{'name': 'TESSERACT_PATH', 'global_name': 'OCR_TESSERACT_PATH', 'default': '/usr/bin/tesseract', 'exists': True},
|
||||
{'name': 'UNPAPER_PATH', 'global_name': 'OCR_UNPAPER_PATH', 'default': '/usr/bin/unpaper', 'description': _('File path to unpaper program.'), 'exists': True},
|
||||
{'name': 'PDFTOTEXT_PATH', 'global_name': 'OCR_PDFTOTEXT_PATH', 'default': '/usr/bin/pdftotext', 'description': _('File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
|
||||
{'name': 'BACKEND', 'global_name': 'OCR_BACKEND', 'default': 'ocr.backends.tesseract.Tesseract', 'description': _('Full path to the backend to be used to do OCR.')},
|
||||
]
|
||||
)
|
||||
|
||||
@@ -0,0 +1,88 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from south.utils import datetime_utils as datetime
|
||||
from south.db import db
|
||||
from south.v2 import SchemaMigration
|
||||
from django.db import models
|
||||
|
||||
|
||||
class Migration(SchemaMigration):
|
||||
|
||||
def forwards(self, orm):
|
||||
# Deleting model 'DocumentQueue'
|
||||
db.delete_table(u'ocr_documentqueue')
|
||||
|
||||
# Deleting model 'QueueDocument'
|
||||
db.delete_table(u'ocr_queuedocument')
|
||||
|
||||
# Adding model 'DocumentVersionOCRError'
|
||||
db.create_table(u'ocr_documentversionocrerror', (
|
||||
(u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
|
||||
('document_version', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.DocumentVersion'])),
|
||||
('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, db_index=True, blank=True)),
|
||||
('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)),
|
||||
))
|
||||
db.send_create_signal(u'ocr', ['DocumentVersionOCRError'])
|
||||
|
||||
|
||||
def backwards(self, orm):
|
||||
# Adding model 'DocumentQueue'
|
||||
db.create_table(u'ocr_documentqueue', (
|
||||
(u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
|
||||
('name', self.gf('django.db.models.fields.CharField')(max_length=64, unique=True)),
|
||||
('label', self.gf('django.db.models.fields.CharField')(max_length=64)),
|
||||
))
|
||||
db.send_create_signal(u'ocr', ['DocumentQueue'])
|
||||
|
||||
# Adding model 'QueueDocument'
|
||||
db.create_table(u'ocr_queuedocument', (
|
||||
('node_name', self.gf('django.db.models.fields.CharField')(max_length=256, null=True, blank=True)),
|
||||
('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)),
|
||||
('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, blank=True, db_index=True)),
|
||||
('document_queue', self.gf('django.db.models.fields.related.ForeignKey')(related_name='documents', to=orm['ocr.DocumentQueue'])),
|
||||
('document', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.Document'])),
|
||||
(u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
|
||||
))
|
||||
db.send_create_signal(u'ocr', ['QueueDocument'])
|
||||
|
||||
# Deleting model 'DocumentVersionOCRError'
|
||||
db.delete_table(u'ocr_documentversionocrerror')
|
||||
|
||||
|
||||
models = {
|
||||
u'documents.document': {
|
||||
'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'},
|
||||
'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
|
||||
'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
|
||||
'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}),
|
||||
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
|
||||
'label': ('django.db.models.fields.CharField', [], {'default': "u'Uninitialized document'", 'max_length': '255', 'db_index': 'True'}),
|
||||
'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}),
|
||||
'uuid': ('django.db.models.fields.CharField', [], {'default': "u'b5b498b5-ffe5-4b70-b8a6-6c875ed11bf2'", 'max_length': '48'})
|
||||
},
|
||||
u'documents.documenttype': {
|
||||
'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'},
|
||||
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
|
||||
'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}),
|
||||
'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'})
|
||||
},
|
||||
u'documents.documentversion': {
|
||||
'Meta': {'object_name': 'DocumentVersion'},
|
||||
'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
|
||||
'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
|
||||
'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}),
|
||||
'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}),
|
||||
'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}),
|
||||
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
|
||||
'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
|
||||
'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'})
|
||||
},
|
||||
u'ocr.documentversionocrerror': {
|
||||
'Meta': {'ordering': "('datetime_submitted',)", 'object_name': 'DocumentVersionOCRError'},
|
||||
'datetime_submitted': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}),
|
||||
'document_version': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentVersion']"}),
|
||||
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
|
||||
'result': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
|
||||
}
|
||||
}
|
||||
|
||||
complete_apps = ['ocr']
|
||||
@@ -1,65 +1,61 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import platform
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from documents.models import Document
|
||||
from documents.models import DocumentVersion
|
||||
from lock_manager import Lock, LockError
|
||||
from mayan.celery import app
|
||||
|
||||
from .api import do_document_ocr
|
||||
from .literals import LOCK_EXPIRE
|
||||
from .models import DocumentQueue, QueueDocument
|
||||
from .models import DocumentVersionOCRError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@app.task(ignore_result=True)
def task_do_ocr(document_version_pk):
    """
    Run OCR on a single document version.

    document_version_pk: primary key of the DocumentVersion to process.

    A lock named after the primary key avoids processing the same document
    version more than once concurrently; when the lock cannot be obtained
    the task ends without doing any work. A failure is logged and, when the
    document version could be loaded, stored in a DocumentVersionOCRError
    entry for it. A success deletes any error entry of that version.
    """
    lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk
    try:
        logger.debug('trying to acquire lock: %s', lock_id)
        # Acquire lock to avoid doing OCR on the same document version more
        # than once concurrently
        lock = Lock.acquire_lock(lock_id, LOCK_EXPIRE)
        logger.debug('acquired lock: %s', lock_id)
        # Stays None when the lookup below fails, so the error handler
        # knows there is no version to attach an error entry to
        document_version = None
        try:
            logger.info('Starting document OCR for document version: %d', document_version_pk)
            document_version = DocumentVersion.objects.get(pk=document_version_pk)
            do_document_ocr(document_version)
        except Exception as exception:
            # Broad on purpose: any OCR failure is recorded, not propagated
            logger.error('OCR error for document version: %d; %s', document_version_pk, exception)
            if document_version is not None:
                entry = DocumentVersionOCRError.objects.get_or_create(document_version=document_version)[0]

                if settings.DEBUG:
                    # Keep the exception type, message and traceback
                    exc_type, exc_value, exc_traceback = sys.exc_info()
                    lines = ['%s: %s' % (exc_type.__name__, exc_value)]
                    lines.extend(traceback.format_tb(exc_traceback))
                    entry.result = '\n'.join(lines)
                else:
                    # NOTE(review): assigns the exception object itself;
                    # presumably converted to text by the field when
                    # saved - confirm
                    entry.result = exception

                entry.save()
        else:
            logger.info('OCR for document version: %d ended', document_version_pk)
            # The foreign key is not unique, so delete through a filter:
            # it removes every stale entry and is a no-op when none exist
            DocumentVersionOCRError.objects.filter(document_version=document_version).delete()
        finally:
            lock.release()
    except LockError:
        logger.debug('unable to obtain lock: %s', lock_id)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
from django.test import TransactionTestCase
|
||||
@@ -6,8 +6,6 @@ from django.test import TransactionTestCase
|
||||
from documents.models import Document, DocumentType
|
||||
from documents.tests import TEST_SMALL_DOCUMENT_PATH, TEST_DOCUMENT_TYPE
|
||||
|
||||
from .models import DocumentQueue, QueueDocument
|
||||
|
||||
|
||||
class DocumentOCRTestCase(TransactionTestCase):
|
||||
def setUp(self):
|
||||
@@ -16,11 +14,6 @@ class DocumentOCRTestCase(TransactionTestCase):
|
||||
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
|
||||
self.document = Document.objects.new_document(file_object=File(file_object), document_type=self.document_type)[0].document
|
||||
|
||||
DocumentQueue.objects.get_or_create(name='default')
|
||||
|
||||
# Clear OCR queue
|
||||
QueueDocument.objects.all().delete()
|
||||
|
||||
def _test_ocr_language_issue_16(self, language, result):
|
||||
"""
|
||||
Reusable OCR test for a specific language
|
||||
|
||||
@@ -1,19 +1,21 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.conf.urls import patterns, url
|
||||
|
||||
from .api_views import DocumentOCRView
|
||||
from .api_views import DocumentVersionOCRView
|
||||
|
||||
urlpatterns = patterns('ocr.views',
|
||||
url(r'^document/(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
|
||||
url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'),
|
||||
url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
|
||||
url(r'^queue/document/(?P<queue_document_id>\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'),
|
||||
url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'),
|
||||
url(r'^queue/document/(?P<queue_document_id>\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'),
|
||||
url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'),
|
||||
url(r'^document/(?P<pk>\d+)/submit/$', 'document_submit', (), 'document_submit'),
|
||||
url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'),
|
||||
url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'),
|
||||
|
||||
url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
|
||||
url(r'^all/$', 'entry_list', (), 'entry_list'),
|
||||
url(r'^(?P<pk>\d+)/delete/$', 'entry_delete', (), 'entry_delete'),
|
||||
url(r'^multiple/delete/$', 'entry_delete_multiple', (), 'entry_delete_multiple'),
|
||||
url(r'^(?P<pk>\d+)/re-queue/$', 'entry_re_queue', (), 'entry_re_queue'),
|
||||
url(r'^multiple/re-queue/$', 'entry_re_queue_multiple', (), 'entry_re_queue_multiple'),
|
||||
)
|
||||
|
||||
api_urls = patterns('',
|
||||
url(r'^submit/$', DocumentOCRView.as_view(), name='document-ocr-submit-view'),
|
||||
url(r'^submit/$', DocumentVersionOCRView.as_view(), name='document-version-ocr-submit-view'),
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import absolute_import
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.contrib import messages
|
||||
from django.core.exceptions import PermissionDenied
|
||||
@@ -6,173 +6,43 @@ from django.core.urlresolvers import reverse
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.shortcuts import get_object_or_404, render_to_response
|
||||
from django.template import RequestContext
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext_lazy as _, ungettext
|
||||
|
||||
from acls.models import AccessEntry
|
||||
from common.utils import encapsulate
|
||||
from documents.models import Document
|
||||
from documents.widgets import document_link, document_thumbnail
|
||||
from documents.models import Document, DocumentVersion
|
||||
from permissions.models import Permission
|
||||
|
||||
from .api import clean_pages
|
||||
from .models import DocumentQueue, QueueDocument
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES,
|
||||
PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE)
|
||||
|
||||
|
||||
def queue_document_list(request, queue_name='default'):
    """
    Render the list of documents held by the OCR queue named `queue_name`.

    The requesting user must hold PERMISSION_OCR_DOCUMENT. A queue name
    that matches no DocumentQueue goes through get_object_or_404.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])

    queue = get_object_or_404(DocumentQueue, name=queue_name)

    def document_column(queue_document):
        # Fall back to a placeholder text when the entry has no document.
        if hasattr(queue_document, 'document'):
            return document_link(queue_document.document)
        return _(u'Missing document.')

    def thumbnail_column(queue_document):
        return document_thumbnail(queue_document.document)

    def added_column(queue_document):
        # Keep only the text before the first '.' of the submission time.
        return unicode(queue_document.datetime_submitted).split('.')[0]

    extra_columns = [
        {'name': _('Document'), 'attribute': encapsulate(document_column)},
        {'name': _(u'Thumbnail'), 'attribute': encapsulate(thumbnail_column)},
        {'name': _('Added'), 'attribute': encapsulate(added_column), 'keep_together': True},
        {'name': _('Node'), 'attribute': 'node_name'},
        {'name': _('Result'), 'attribute': 'result'},
    ]

    context = {
        'object_list': queue.documents.all(),
        'title': _(u'Documents in queue: %s') % queue,
        'hide_object': True,
        'queue': queue,
        'navigation_object_name': 'queue',
        'list_object_variable_name': 'queue_document',
        'extra_columns': extra_columns,
    }

    return render_to_response(
        'main/generic_list.html', context,
        context_instance=RequestContext(request)
    )
|
||||
|
||||
|
||||
def queue_document_delete(request, queue_document_id=None, queue_document_id_list=None):
    """
    Confirm and perform the deletion of one or more OCR queue documents.

    Either `queue_document_id` (a single primary key) or
    `queue_document_id_list` (comma separated primary keys) selects the
    entries. A GET request renders the confirmation page; a POST request
    deletes each entry, reports the outcome through the messages
    framework and redirects.

    The requesting user must hold PERMISSION_OCR_DOCUMENT_DELETE.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE])

    # Used whenever the request supplies no explicit destination, so a
    # redirect is never attempted with a None target.
    fallback_url = request.META.get('HTTP_REFERER', reverse('main:home'))

    if queue_document_id:
        pks = [queue_document_id]
    else:
        # Skip blank items ('', '1,,2', trailing commas) instead of passing
        # them on as primary keys.
        pks = [pk.strip() for pk in (queue_document_id_list or '').split(',') if pk.strip()]

    if not pks:
        messages.error(request, _(u'Must provide at least one queue document.'))
        return HttpResponseRedirect(fallback_url)

    queue_documents = [get_object_or_404(QueueDocument, pk=pk) for pk in pks]

    next_url = request.POST.get('next', request.GET.get('next', fallback_url))
    previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))

    if request.method == 'POST':
        for queue_document in queue_documents:
            try:
                queue_document.delete()
                messages.success(request, _(u'Queue document: %(document)s deleted successfully.') % {
                    'document': queue_document.document})
            except Exception as exception:
                # Deliberate best effort: report the failure to the user
                # and keep processing the remaining entries.
                messages.error(request, _(u'Error deleting document: %(document)s; %(error)s') % {
                    'document': queue_document, 'error': exception})
        return HttpResponseRedirect(next_url)

    context = {
        'next': next_url,
        'previous': previous,
        'delete_view': True,
    }

    names = ', '.join([unicode(d) for d in queue_documents])
    if len(queue_documents) == 1:
        context['object'] = queue_documents[0]
        context['title'] = _(u'Are you sure you wish to delete queue document: %s?') % names
    else:
        context['title'] = _(u'Are you sure you wish to delete queue documents: %s?') % names

    return render_to_response(
        'main/generic_confirm.html', context,
        context_instance=RequestContext(request)
    )
|
||||
|
||||
|
||||
def queue_document_multiple_delete(request):
    """
    Delete several queue documents at once.

    The primary keys are read from the `id_list` GET parameter and handed
    to queue_document_delete() unchanged.
    """
    selected_ids = request.GET.get('id_list', '')
    return queue_document_delete(request, queue_document_id_list=selected_ids)
|
||||
|
||||
|
||||
def submit_document_multiple(request):
    """
    Submit every document listed in the `id_list` GET parameter for OCR.

    Each id is passed to submit_document(); its individual response is
    discarded and a single redirect to the referring page (or the home
    view) is returned.
    """
    for item_id in request.GET.get('id_list', '').split(','):
        item_id = item_id.strip()
        # ''.split(',') yields [''], so a missing or blank id_list would
        # otherwise be submitted as an (invalid) primary key.
        if item_id:
            submit_document(request, item_id)

    return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
|
||||
|
||||
|
||||
def submit_document(request, document_id):
|
||||
document = get_object_or_404(Document, pk=document_id)
|
||||
def document_submit(request, pk):
|
||||
document = get_object_or_404(Document, pk=pk)
|
||||
|
||||
try:
|
||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
|
||||
except PermissionDenied:
|
||||
AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document)
|
||||
|
||||
return submit_document_to_queue(request, document=document,
|
||||
post_submit_redirect=request.META.get('HTTP_REFERER', reverse('main:home')))
|
||||
|
||||
|
||||
def submit_document_to_queue(request, document, post_submit_redirect=None):
|
||||
"""
|
||||
This view is meant to be reusable
|
||||
"""
|
||||
|
||||
document.submit_for_ocr()
|
||||
messages.success(request, _(u'Document: %(document)s was added to the OCR queue.') % {
|
||||
messages.success(request, _('Document: %(document)s was added to the OCR queue.') % {
|
||||
'document': document}
|
||||
)
|
||||
|
||||
if post_submit_redirect:
|
||||
return HttpResponseRedirect(post_submit_redirect)
|
||||
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
|
||||
|
||||
|
||||
def re_queue_document(request, queue_document_id=None, queue_document_id_list=None):
|
||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
|
||||
def document_submit_multiple(request):
|
||||
for item_id in request.GET.get('id_list', '').split(','):
|
||||
document_submit(request, item_id)
|
||||
|
||||
if queue_document_id:
|
||||
queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)]
|
||||
elif queue_document_id_list:
|
||||
queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id) for queue_document_id in queue_document_id_list.split(',')]
|
||||
else:
|
||||
messages.error(request, _(u'Must provide at least one queue document.'))
|
||||
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
|
||||
|
||||
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
|
||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
|
||||
|
||||
if request.method == 'POST':
|
||||
for queue_document in queue_documents:
|
||||
try:
|
||||
queue_document.document.submit_for_ocr()
|
||||
messages.success(
|
||||
request,
|
||||
_(u'Document: %(document)s was re-queued for OCR.') % {
|
||||
'document': queue_document.document
|
||||
}
|
||||
)
|
||||
except Document.DoesNotExist:
|
||||
messages.error(request, _(u'Document id#: %d, no longer exists.') % queue_document.document_id)
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
context = {
|
||||
'next': next,
|
||||
'previous': previous,
|
||||
}
|
||||
|
||||
if len(queue_documents) == 1:
|
||||
context['object'] = queue_documents[0]
|
||||
context['title'] = _(u'Are you sure you wish to re-queue document: %s?') % ', '.join([unicode(d) for d in queue_documents])
|
||||
elif len(queue_documents) > 1:
|
||||
context['title'] = _(u'Are you sure you wish to re-queue documents: %s?') % ', '.join([unicode(d) for d in queue_documents])
|
||||
|
||||
return render_to_response('main/generic_confirm.html', context,
|
||||
context_instance=RequestContext(request))
|
||||
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
|
||||
|
||||
|
||||
def re_queue_multiple_document(request):
    """
    Re-queue several queue documents at once.

    The primary keys are read from the `id_list` GET parameter and handed
    to re_queue_document().
    """
    # Default to an empty string (not a list): the callee calls
    # .split(',') on this value, matching queue_document_multiple_delete().
    return re_queue_document(request, queue_document_id_list=request.GET.get('id_list', ''))
|
||||
|
||||
|
||||
def all_document_ocr_cleanup(request):
|
||||
def document_all_ocr_cleanup(request):
|
||||
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES])
|
||||
|
||||
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
|
||||
@@ -182,27 +52,133 @@ def all_document_ocr_cleanup(request):
|
||||
return render_to_response('main/generic_confirm.html', {
|
||||
'previous': previous,
|
||||
'next': next,
|
||||
'title': _(u'Are you sure you wish to clean up all the pages content?'),
|
||||
'message': _(u'On large databases this operation may take some time to execute.'),
|
||||
'title': _('Are you sure you wish to clean up all the pages content?'),
|
||||
'message': _('On large databases this operation may take some time to execute.'),
|
||||
}, context_instance=RequestContext(request))
|
||||
else:
|
||||
try:
|
||||
# TODO: turn this into a Celery task
|
||||
clean_pages()
|
||||
messages.success(request, _(u'Document pages content clean up complete.'))
|
||||
messages.success(request, _('Document pages content clean up complete.'))
|
||||
except Exception as exception:
|
||||
messages.error(request, _(u'Document pages content clean up error: %s') % exception)
|
||||
messages.error(request, _('Document pages content clean up error: %s') % exception)
|
||||
|
||||
return HttpResponseRedirect(next)
|
||||
|
||||
|
||||
def display_link(obj):
|
||||
output = []
|
||||
if hasattr(obj, 'get_absolute_url'):
|
||||
output.append(u'<a href="%(url)s">%(obj)s</a>' % {
|
||||
'url': obj.get_absolute_url(),
|
||||
'obj': obj
|
||||
})
|
||||
if output:
|
||||
return u''.join(output)
|
||||
def entry_list(request):
    """
    Render the list of all recorded document version OCR errors.

    The requesting user must hold PERMISSION_OCR_DOCUMENT.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])

    context = {}
    context['object_list'] = DocumentVersionOCRError.objects.all()
    context['title'] = _('OCR errors')
    context['hide_object'] = True

    request_context = RequestContext(request)
    return render_to_response(
        'main/generic_list.html', context, context_instance=request_context
    )
|
||||
|
||||
|
||||
def entry_delete(request, pk=None, pk_list=None):
    """
    Confirm and perform the deletion of one or more OCR error entries.

    Either `pk` (a single primary key) or `pk_list` (comma separated
    primary keys) selects the entries. A GET request renders the
    confirmation page; a POST request deletes each entry, reports the
    outcome through the messages framework and redirects.

    The requesting user must hold PERMISSION_OCR_DOCUMENT_DELETE.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE])

    # Used whenever the request supplies no explicit destination, so a
    # redirect is never attempted with a None target.
    fallback_url = request.META.get('HTTP_REFERER', reverse('main:home'))

    if pk:
        pks = [pk]
    else:
        # Skip blank items ('', '1,,2', trailing commas) instead of passing
        # them on as primary keys.
        pks = [item.strip() for item in (pk_list or '').split(',') if item.strip()]

    if not pks:
        messages.error(request, _('Make at least one selection.'))
        return HttpResponseRedirect(fallback_url)

    entries = [get_object_or_404(DocumentVersionOCRError, pk=item) for item in pks]

    next_url = request.POST.get('next', request.GET.get('next', fallback_url))
    previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))

    if request.method == 'POST':
        for entry in entries:
            try:
                entry.delete()
                messages.success(request, _('Entry: %(entry)s deleted successfully.') % {
                    'entry': entry})
            except Exception as exception:
                # Deliberate best effort: report the failure to the user
                # and keep processing the remaining entries.
                messages.error(request, _('Error entry: %(entry)s; %(error)s') % {
                    'entry': entry, 'error': exception})
        return HttpResponseRedirect(next_url)

    context = {
        'next': next_url,
        'previous': previous,
        'delete_view': True,
    }

    if len(entries) == 1:
        context['object'] = entries[0]

    # Both forms are questions, so both end with a question mark.
    context['title'] = ungettext(
        'Are you sure you wish to delete the entry: %(entry)s?',
        'Are you sure you wish to delete these %(count)d entries?',
        len(entries)
    ) % {
        'count': len(entries),
        'entry': entries[0],
    }

    return render_to_response(
        'main/generic_confirm.html', context,
        context_instance=RequestContext(request)
    )
|
||||
|
||||
|
||||
def entry_delete_multiple(request):
    """
    Delete several OCR error entries at once.

    The primary keys are read from the `id_list` GET parameter and handed
    to entry_delete() unchanged.
    """
    selected_ids = request.GET.get('id_list', '')
    return entry_delete(request, pk_list=selected_ids)
|
||||
|
||||
|
||||
def entry_re_queue(request, pk=None, pk_list=None):
    """
    Confirm and perform the re-submission for OCR of the document versions
    referenced by one or more OCR error entries.

    Either `pk` (a single primary key) or `pk_list` (comma separated
    primary keys) selects the entries. A GET request renders the
    confirmation page; a POST request calls submit_for_ocr() on each
    entry's document version, reports the outcome through the messages
    framework and redirects.

    The requesting user must hold PERMISSION_OCR_DOCUMENT.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])

    # Used whenever the request supplies no explicit destination, so a
    # redirect is never attempted with a None target.
    fallback_url = request.META.get('HTTP_REFERER', reverse('main:home'))

    if pk:
        pks = [pk]
    else:
        # Skip blank items ('', '1,,2', trailing commas) instead of passing
        # them on as primary keys.
        pks = [item.strip() for item in (pk_list or '').split(',') if item.strip()]

    if not pks:
        messages.error(request, _('Make at least one selection.'))
        return HttpResponseRedirect(fallback_url)

    entries = [get_object_or_404(DocumentVersionOCRError, pk=item) for item in pks]

    next_url = request.POST.get('next', request.GET.get('next', fallback_url))
    previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))

    if request.method == 'POST':
        for entry in entries:
            try:
                entry.document_version.submit_for_ocr()
                messages.success(
                    request,
                    _('Entry: %(entry)s was re-queued for OCR.') % {
                        'entry': entry
                    }
                )
            except DocumentVersion.DoesNotExist:
                # The entry outlived the document version it points to.
                messages.error(request, _('Document version id#: %d, no longer exists.') % entry.document_version_id)
        return HttpResponseRedirect(next_url)

    context = {
        'next': next_url,
        'previous': previous,
    }

    if len(entries) == 1:
        context['object'] = entries[0]

    # Both forms are questions, so both end with a question mark.
    context['title'] = ungettext(
        'Are you sure you wish to re-queue the entry: %(entry)s?',
        'Are you sure you wish to re-queue these %(count)d entries?',
        len(entries)
    ) % {
        'count': len(entries),
        'entry': entries[0],
    }

    return render_to_response(
        'main/generic_confirm.html', context,
        context_instance=RequestContext(request)
    )
|
||||
|
||||
|
||||
def entry_re_queue_multiple(request):
    """
    Re-queue several OCR error entries at once.

    The primary keys are read from the `id_list` GET parameter and handed
    to entry_re_queue().
    """
    # Default to an empty string (not a list): the callee calls
    # .split(',') on this value, matching entry_delete_multiple().
    return entry_re_queue(request, pk_list=request.GET.get('id_list', ''))
|
||||
|
||||
Reference in New Issue
Block a user