From e6754c9a6f036441c26bee077ce36c8c72353ec8 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 03:01:43 -0400 Subject: [PATCH] Update the OCR app to work based on document versions not documents, document version are the module which hold the document pages instances. Remove old OCR document queue and replace with a single module for OCR processing error entries. Increase compatibility with Django 1.7 and Python 3. --- mayan/apps/ocr/__init__.py | 60 ++-- mayan/apps/ocr/admin.py | 19 +- mayan/apps/ocr/api.py | 18 +- mayan/apps/ocr/api_views.py | 18 +- mayan/apps/ocr/backends/__init__.py | 2 +- mayan/apps/ocr/backends/tesseract.py | 6 +- mayan/apps/ocr/exceptions.py | 3 + mayan/apps/ocr/lang/deu.py | 2 +- mayan/apps/ocr/lang/eng.py | 2 +- mayan/apps/ocr/lang/rus.py | 2 +- mayan/apps/ocr/lang/spa.py | 2 +- mayan/apps/ocr/links.py | 19 +- mayan/apps/ocr/literals.py | 8 +- mayan/apps/ocr/models.py | 41 +-- mayan/apps/ocr/parsers/__init__.py | 12 +- mayan/apps/ocr/permissions.py | 10 +- mayan/apps/ocr/runtime.py | 2 - mayan/apps/ocr/serializers.py | 6 +- mayan/apps/ocr/settings.py | 14 +- ...euedocument__add_documentversionocrerro.py | 88 ++++++ mayan/apps/ocr/tasks.py | 48 ++- mayan/apps/ocr/tests.py | 9 +- mayan/apps/ocr/urls.py | 22 +- mayan/apps/ocr/views.py | 290 ++++++++---------- 24 files changed, 375 insertions(+), 328 deletions(-) create mode 100644 mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py diff --git a/mayan/apps/ocr/__init__.py b/mayan/apps/ocr/__init__.py index c4be0a4539..4f4e79d184 100644 --- a/mayan/apps/ocr/__init__.py +++ b/mayan/apps/ocr/__init__.py @@ -1,61 +1,75 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import logging from django.dispatch import receiver from django.utils.translation import ugettext_lazy as _ -from south.signals import post_migrate - from acls.api import class_permissions +from common.utils import encapsulate from documents.models import Document, DocumentVersion from documents.signals import post_version_upload +from documents.widgets import document_link from main.api import register_maintenance_links -from navigation.api import register_links +from navigation.api import register_links, register_model_list_columns from navigation.links import link_spacer from project_tools.api import register_tool from rest_api.classes import APIEndPoint -from .links import (all_document_ocr_cleanup, ocr_tool_link, - queue_document_list, queue_document_multiple_delete, - re_queue_multiple_document, submit_document, - submit_document_multiple) -from .models import DocumentQueue +from .links import ( + link_document_all_ocr_cleanup, link_document_submit, + link_document_submit_multiple, link_entry_delete, + link_entry_delete_multiple, link_entry_list, link_entry_re_queue, + link_entry_re_queue_multiple +) +from .models import DocumentVersionOCRError from .permissions import PERMISSION_OCR_DOCUMENT from .tasks import task_do_ocr logger = logging.getLogger(__name__) -register_links(Document, [submit_document]) -register_links([Document], [submit_document_multiple, link_spacer], menu_name='multi_item_links') -register_links(['ocr:queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete]) -register_links(['ocr:queue_document_list'], [queue_document_list], menu_name='secondary_menu') +register_links(Document, [link_document_submit]) +register_links([Document], [link_document_submit_multiple, link_spacer], menu_name='multi_item_links') -register_maintenance_links([all_document_ocr_cleanup], namespace='ocr', title=_(u'OCR')) +register_links([DocumentVersionOCRError], [link_entry_re_queue_multiple, link_entry_delete_multiple, link_spacer], menu_name='multi_item_links') +register_links([DocumentVersionOCRError], [link_entry_re_queue, link_entry_delete]) +register_links(['ocr:entry_list', 'ocr:entry_delete_multiple', 'ocr:entry_re_queue_multiple', DocumentVersionOCRError], [link_entry_list], menu_name='secondary_menu') +register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR')) def document_ocr_submit(self): task_do_ocr.apply_async(args=[self.pk], queue='ocr') +def document_version_ocr_submit(self): + task_do_ocr.apply_async(args=[self.document.pk], queue='ocr') + + @receiver(post_version_upload, dispatch_uid='post_version_upload_ocr', sender=DocumentVersion) def post_version_upload_ocr(sender, instance, **kwargs): logger.debug('received post_version_upload') - logger.debug('instance.document: %s', instance.document) + logger.debug('instance pk: %s', instance.pk) if instance.document.document_type.ocr: - instance.document.submit_for_ocr() - - -@receiver(post_migrate, dispatch_uid='create_default_queue') -def create_default_queue_signal_handler(sender, **kwargs): - if kwargs['app'] == 'ocr': - DocumentQueue.objects.get_or_create(name='default') + instance.submit_for_ocr() Document.add_to_class('submit_for_ocr', document_ocr_submit) +DocumentVersion.add_to_class('submit_for_ocr', document_version_ocr_submit) class_permissions(Document, [PERMISSION_OCR_DOCUMENT]) -register_tool(ocr_tool_link) +register_tool(link_entry_list) APIEndPoint('ocr') + +register_model_list_columns(DocumentVersionOCRError, [ + { + 'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document)) + }, + { + 'name': _('Added'), 'attribute': 'datetime_submitted' + }, + { + 'name': _('Result'), 'attribute': 'result' + }, +]) diff --git a/mayan/apps/ocr/admin.py b/mayan/apps/ocr/admin.py index 7cae1df462..cd434cc85f 100644 --- a/mayan/apps/ocr/admin.py +++ b/mayan/apps/ocr/admin.py @@ -1,20 +1,13 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.contrib import admin -from .models import DocumentQueue, QueueDocument +from .models import DocumentVersionOCRError -class QueueDocumentInline(admin.StackedInline): - model = QueueDocument - extra = 1 - classes = ('collapse-open',) - allow_add = True +class DocumentVersionOCRErrorAdmin(admin.ModelAdmin): + list_display = ('document_version', 'datetime_submitted') + readonly_fields = ('document_version', 'datetime_submitted', 'result') -class DocumentQueueAdmin(admin.ModelAdmin): - inlines = [QueueDocumentInline] - list_display = ('name', 'label') - - -admin.site.register(DocumentQueue, DocumentQueueAdmin) +admin.site.register(DocumentVersionOCRError, DocumentVersionOCRErrorAdmin) diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index f0189a09df..b5e772019a 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import logging import os @@ -30,14 +30,14 @@ except sh.CommandNotFound: UNPAPER = None -def do_document_ocr(document): +def do_document_ocr(document_version): """ Try first to extract text from document pages using the registered parser, if the parser fails or if there is no parser registered for the document mimetype do a visual OCR by calling the corresponding OCR backend """ - for document_page in document.pages.all(): + for document_page in document_version.pages.all(): try: # Try to extract text by means of a parser parse_document_page(document_page) @@ -68,10 +68,10 @@ def do_document_ocr(document): os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: - ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document.language) + ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language) - document_page.content = ocr_cleanup(document.language, ocr_text) - document_page.page_label = _(u'Text from OCR') + document_page.content = ocr_cleanup(document_version.document.language, ocr_text) + document_page.page_label = _('Text from OCR') document_page.save() finally: fs_cleanup(pre_ocr_filepath_w_ext) @@ -86,7 +86,7 @@ def ocr_cleanup(language, text): cleanup filter """ try: - language_backend = load_backend(u'.'.join([u'ocr', u'lang', language, u'LanguageBackend']))() + language_backend = load_backend('.'.join(['ocr', 'lang', language, 'LanguageBackend']))() except ImportError: language_backend = None @@ -104,9 +104,9 @@ def ocr_cleanup(language, text): result = word if result: output.append(result) - output.append(u'\n') + output.append('\n') - return u' '.join(output) + return ' '.join(output) def clean_pages(): diff --git a/mayan/apps/ocr/api_views.py b/mayan/apps/ocr/api_views.py index 220339e54f..75d5bb338a 100644 --- a/mayan/apps/ocr/api_views.py +++ b/mayan/apps/ocr/api_views.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals from django.core.exceptions import PermissionDenied from django.shortcuts import get_object_or_404 @@ -8,33 +8,33 @@ from rest_framework.response import Response from rest_framework.settings import api_settings from acls.models import AccessEntry -from documents.models import Document +from documents.models import DocumentVersion from permissions.models import Permission from rest_api.permissions import MayanPermission from .permissions import PERMISSION_OCR_DOCUMENT -from .serializers import DocumentOCRSerializer +from .serializers import DocumentVersionOCRSerializer -class DocumentOCRView(generics.GenericAPIView): - serializer_class = DocumentOCRSerializer +class DocumentVersionOCRView(generics.GenericAPIView): + serializer_class = DocumentVersionOCRSerializer permission_classes = (MayanPermission,) def post(self, request, *args, **kwargs): - """Submit document OCR queue.""" + """Submit document version for OCR.""" serializer = self.get_serializer(data=request.DATA, files=request.FILES) if serializer.is_valid(): - document = get_object_or_404(Document, pk=serializer.data['document_id']) + document_version = get_object_or_404(DocumentVersion, pk=serializer.data['document_version_id']) try: Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) except PermissionDenied: - AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document) + AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document_version.document) - document.submit_for_ocr() + document_version.submit_for_ocr() headers = self.get_success_headers(serializer.data) return Response(serializer.data, status=status.HTTP_202_ACCEPTED, diff --git a/mayan/apps/ocr/backends/__init__.py b/mayan/apps/ocr/backends/__init__.py index f6e245ceb8..6558a75c85 100644 --- a/mayan/apps/ocr/backends/__init__.py +++ b/mayan/apps/ocr/backends/__init__.py @@ -1,3 +1,3 @@ class BackendBase(object): - def execute(self, input_filename, language=None): # NOQA + def execute(self, input_filename, language=None): raise NotImplementedError diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py index f0c34477c0..e36b4c043a 100644 --- a/mayan/apps/ocr/backends/tesseract.py +++ b/mayan/apps/ocr/backends/tesseract.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import codecs import errno @@ -20,11 +20,11 @@ class Tesseract(BackendBase): """ fd, filepath = tempfile.mkstemp() os.close(fd) - ocr_output = os.extsep.join([filepath, u'txt']) + ocr_output = os.extsep.join([filepath, 'txt']) command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)] if language is not None: - command.extend([u'-l', language]) + command.extend(['-l', language]) try: proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) diff --git a/mayan/apps/ocr/exceptions.py b/mayan/apps/ocr/exceptions.py index 5497c92ea5..123f52160f 100644 --- a/mayan/apps/ocr/exceptions.py +++ b/mayan/apps/ocr/exceptions.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + + class OCRError(Exception): """ Raised by the OCR backend diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py index a3ca0383e9..ccff3eba7d 100644 --- a/mayan/apps/ocr/lang/deu.py +++ b/mayan/apps/ocr/lang/deu.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/lang/eng.py b/mayan/apps/ocr/lang/eng.py index 29dc3384e8..5025db136d 100644 --- a/mayan/apps/ocr/lang/eng.py +++ b/mayan/apps/ocr/lang/eng.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/lang/rus.py b/mayan/apps/ocr/lang/rus.py index 05ce0e1ab1..e7b7588358 100644 --- a/mayan/apps/ocr/lang/rus.py +++ b/mayan/apps/ocr/lang/rus.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/lang/spa.py b/mayan/apps/ocr/lang/spa.py index eb4d9ead45..c736a69b9a 100644 --- a/mayan/apps/ocr/lang/spa.py +++ b/mayan/apps/ocr/lang/spa.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/links.py b/mayan/apps/ocr/links.py index eb41e2b9d7..f41743f121 100644 --- a/mayan/apps/ocr/links.py +++ b/mayan/apps/ocr/links.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.utils.translation import ugettext_lazy as _ @@ -6,14 +6,13 @@ from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE) -submit_document = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -submit_document_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document_multiple', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -re_queue_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -re_queue_multiple_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_multiple_document', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -queue_document_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} -queue_document_multiple_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} +link_document_submit = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} +link_document_submit_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit_multiple', 'famfam': 'hourglass_add'} +link_entry_re_queue = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} +link_entry_re_queue_multiple = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue_multiple', 'famfam': 'hourglass_add'} +link_entry_delete = {'text': _('Delete'), 'view': 'ocr:entry_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} +link_entry_delete_multiple = {'text': _('Delete'), 'view': 'ocr:entry_delete_multiple', 'famfam': 'hourglass_delete'} -all_document_ocr_cleanup = {'text': _(u'Clean up pages content'), 'view': 'ocr:all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')} +link_document_all_ocr_cleanup = {'text': _('Clean up pages content'), 'view': 'ocr:document_all_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _('Runs a language filter to remove common OCR mistakes from document pages content.')} -queue_document_list = {'text': _(u'Queue document list'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]} -ocr_tool_link = {'text': _(u'OCR'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]} +link_entry_list = {'text': _('OCR Errors'), 'view': 'ocr:entry_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]} diff --git a/mayan/apps/ocr/literals.py b/mayan/apps/ocr/literals.py index 8e80534272..3a7b1360dc 100644 --- a/mayan/apps/ocr/literals.py +++ b/mayan/apps/ocr/literals.py @@ -1,4 +1,6 @@ -DEFAULT_OCR_FILE_FORMAT = u'tiff' -DEFAULT_OCR_FILE_EXTENSION = u'tif' +from __future__ import unicode_literals + +DEFAULT_OCR_FILE_FORMAT = 'tiff' +DEFAULT_OCR_FILE_EXTENSION = 'tif' LOCK_EXPIRE = 60 * 10 # Adjust to worst case scenario -UNPAPER_FILE_FORMAT = u'ppm' +UNPAPER_FILE_FORMAT = 'ppm' diff --git a/mayan/apps/ocr/models.py b/mayan/apps/ocr/models.py index 8533dcea2a..e4c1713eb9 100644 --- a/mayan/apps/ocr/models.py +++ b/mayan/apps/ocr/models.py @@ -1,39 +1,22 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.db import models -from django.core.exceptions import ObjectDoesNotExist -from django.utils.translation import ugettext +from django.utils.encoding import python_2_unicode_compatible from django.utils.translation import ugettext_lazy as _ -from documents.models import Document +from documents.models import DocumentVersion -class DocumentQueue(models.Model): - name = models.CharField(max_length=64, unique=True, verbose_name=_(u'Name')) - label = models.CharField(max_length=64, verbose_name=_(u'Label')) +@python_2_unicode_compatible +class DocumentVersionOCRError(models.Model): + document_version = models.ForeignKey(DocumentVersion, verbose_name=_('Document version')) + datetime_submitted = models.DateTimeField(verbose_name=_('Date time submitted'), auto_now=True, db_index=True) + result = models.TextField(blank=True, null=True, verbose_name=_('Result')) - class Meta: - verbose_name = _(u'Document queue') - verbose_name_plural = _(u'Document queues') - - def __unicode__(self): - return self.label - - -class QueueDocument(models.Model): - document_queue = models.ForeignKey(DocumentQueue, related_name='documents', verbose_name=_(u'Document queue')) - document = models.ForeignKey(Document, verbose_name=_(u'Document')) - datetime_submitted = models.DateTimeField(verbose_name=_(u'Date time submitted'), auto_now=True, db_index=True) - result = models.TextField(blank=True, null=True, verbose_name=_(u'Result')) - node_name = models.CharField(max_length=256, verbose_name=_(u'Node name'), blank=True, null=True) + def __str__(self): + return unicode(self.document_version) class Meta: ordering = ('datetime_submitted',) - verbose_name = _(u'Queue document') - verbose_name_plural = _(u'Queue documents') - - def __unicode__(self): - try: - return unicode(self.document) - except ObjectDoesNotExist: - return ugettext(u'Missing document.') + verbose_name = _('Document Version OCR Error') + verbose_name_plural = _('Document Version OCR Errors') diff --git a/mayan/apps/ocr/parsers/__init__.py b/mayan/apps/ocr/parsers/__init__.py index d599103b15..f505b3c531 100644 --- a/mayan/apps/ocr/parsers/__init__.py +++ b/mayan/apps/ocr/parsers/__init__.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import logging import os import slate @@ -90,7 +92,7 @@ class SlateParser(Parser): raise ParserError document_page.content = pdf_pages[document_page.page_number - 1] - document_page.page_label = _(u'Text extracted from PDF') + document_page.page_label = _('Text extracted from PDF') document_page.save() @@ -112,7 +114,7 @@ class OfficeParser(Parser): # Now that the office document has been converted to PDF # call the coresponding PDF parser in this new file - parse_document_page(document_page, descriptor=open(input_filepath), mimetype=u'application/pdf') + parse_document_page(document_page, descriptor=open(input_filepath), mimetype='application/pdf') else: raise ParserError @@ -126,7 +128,7 @@ class PopplerParser(Parser): PDF parser using the pdftotext execute from the poppler package """ def __init__(self): - self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else u'/usr/bin/pdftotext' + self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else '/usr/bin/pdftotext' if not os.path.exists(self.pdftotext_path): raise ParserError('cannot find pdftotext executable') logger.debug('self.pdftotext_path: %s', self.pdftotext_path) @@ -167,9 +169,9 @@ class PopplerParser(Parser): raise ParserError('No output') document_page.content = output - document_page.page_label = _(u'Text extracted from PDF') + document_page.page_label = _('Text extracted from PDF') document_page.save() -register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser]) +register_parser(mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser]) register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser]) diff --git a/mayan/apps/ocr/permissions.py b/mayan/apps/ocr/permissions.py index e8dbc188e7..b6bf977a6c 100644 --- a/mayan/apps/ocr/permissions.py +++ b/mayan/apps/ocr/permissions.py @@ -1,10 +1,10 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals from django.utils.translation import ugettext_lazy as _ from permissions.models import Permission, PermissionNamespace -ocr_namespace = PermissionNamespace('ocr', _(u'OCR')) -PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _(u'Submit documents for OCR')) -PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _(u'Delete documents from OCR queue')) -PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _(u'Can execute the OCR clean up on all document pages')) +ocr_namespace = PermissionNamespace('ocr', _('OCR')) +PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR')) +PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue')) +PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages')) diff --git a/mayan/apps/ocr/runtime.py b/mayan/apps/ocr/runtime.py index eef63478c0..78aef88077 100644 --- a/mayan/apps/ocr/runtime.py +++ b/mayan/apps/ocr/runtime.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from common.utils import load_backend from .settings import BACKEND diff --git a/mayan/apps/ocr/serializers.py b/mayan/apps/ocr/serializers.py index c38fb42f7c..9640050fb8 100644 --- a/mayan/apps/ocr/serializers.py +++ b/mayan/apps/ocr/serializers.py @@ -1,7 +1,5 @@ -from __future__ import absolute_import - from rest_framework import serializers -class DocumentOCRSerializer(serializers.Serializer): - document_id = serializers.IntegerField() +class DocumentVersionOCRSerializer(serializers.Serializer): + document_version_id = serializers.IntegerField() diff --git a/mayan/apps/ocr/settings.py b/mayan/apps/ocr/settings.py index ddfec29592..7bde2b13bb 100644 --- a/mayan/apps/ocr/settings.py +++ b/mayan/apps/ocr/settings.py @@ -1,16 +1,16 @@ -"""Configuration options for the ocr app""" +from __future__ import unicode_literals from django.utils.translation import ugettext_lazy as _ from smart_settings.api import register_settings register_settings( - namespace=u'ocr', - module=u'ocr.settings', + namespace='ocr', + module='ocr.settings', settings=[ - {'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True}, - {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, - {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, - {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')}, + {'name': 'TESSERACT_PATH', 'global_name': 'OCR_TESSERACT_PATH', 'default': '/usr/bin/tesseract', 'exists': True}, + {'name': 'UNPAPER_PATH', 'global_name': 'OCR_UNPAPER_PATH', 'default': '/usr/bin/unpaper', 'description': _('File path to unpaper program.'), 'exists': True}, + {'name': 'PDFTOTEXT_PATH', 'global_name': 'OCR_PDFTOTEXT_PATH', 'default': '/usr/bin/pdftotext', 'description': _('File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, + {'name': 'BACKEND', 'global_name': 'OCR_BACKEND', 'default': 'ocr.backends.tesseract.Tesseract', 'description': _('Full path to the backend to be used to do OCR.')}, ] ) diff --git a/mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py b/mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py new file mode 100644 index 0000000000..ff4777e7ca --- /dev/null +++ b/mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from south.utils import datetime_utils as datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + + +class Migration(SchemaMigration): + + def forwards(self, orm): + # Deleting model 'DocumentQueue' + db.delete_table(u'ocr_documentqueue') + + # Deleting model 'QueueDocument' + db.delete_table(u'ocr_queuedocument') + + # Adding model 'DocumentVersionOCRError' + db.create_table(u'ocr_documentversionocrerror', ( + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + ('document_version', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.DocumentVersion'])), + ('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, db_index=True, blank=True)), + ('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)), + )) + db.send_create_signal(u'ocr', ['DocumentVersionOCRError']) + + + def backwards(self, orm): + # Adding model 'DocumentQueue' + db.create_table(u'ocr_documentqueue', ( + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + ('name', self.gf('django.db.models.fields.CharField')(max_length=64, unique=True)), + ('label', self.gf('django.db.models.fields.CharField')(max_length=64)), + )) + db.send_create_signal(u'ocr', ['DocumentQueue']) + + # Adding model 'QueueDocument' + db.create_table(u'ocr_queuedocument', ( + ('node_name', self.gf('django.db.models.fields.CharField')(max_length=256, null=True, blank=True)), + ('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)), + ('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, blank=True, db_index=True)), + ('document_queue', self.gf('django.db.models.fields.related.ForeignKey')(related_name='documents', to=orm['ocr.DocumentQueue'])), + ('document', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.Document'])), + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + )) + db.send_create_signal(u'ocr', ['QueueDocument']) + + # Deleting model 'DocumentVersionOCRError' + db.delete_table(u'ocr_documentversionocrerror') + + + models = { + u'documents.document': { + 'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'}, + 'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'label': ('django.db.models.fields.CharField', [], {'default': "u'Uninitialized document'", 'max_length': '255', 'db_index': 'True'}), + 'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}), + 'uuid': ('django.db.models.fields.CharField', [], {'default': "u'b5b498b5-ffe5-4b70-b8a6-6c875ed11bf2'", 'max_length': '48'}) + }, + u'documents.documenttype': { + 'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}), + 'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'}) + }, + u'documents.documentversion': { + 'Meta': {'object_name': 'DocumentVersion'}, + 'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}), + 'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}), + 'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}) + }, + u'ocr.documentversionocrerror': { + 'Meta': {'ordering': "('datetime_submitted',)", 'object_name': 'DocumentVersionOCRError'}, + 'datetime_submitted': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}), + 'document_version': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentVersion']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'result': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}) + } + } + + complete_apps = ['ocr'] \ No newline at end of file diff --git a/mayan/apps/ocr/tasks.py b/mayan/apps/ocr/tasks.py index 0f506e103f..a7bebddc68 100644 --- a/mayan/apps/ocr/tasks.py +++ b/mayan/apps/ocr/tasks.py @@ -1,65 +1,61 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import logging -import platform import sys import traceback from django.conf import settings -from documents.models import Document +from documents.models import DocumentVersion from lock_manager import Lock, LockError from mayan.celery import app from .api import do_document_ocr from .literals import LOCK_EXPIRE -from .models import DocumentQueue, QueueDocument +from .models import DocumentVersionOCRError logger = logging.getLogger(__name__) @app.task(ignore_result=True) -def task_do_ocr(document_pk): - lock_id = u'task_do_ocr_doc-%d' % document_pk +def task_do_ocr(document_version_pk): + lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk try: logger.debug('trying to acquire lock: %s', lock_id) - # Acquire lock to avoid doing OCR on the same document more than once - # concurrently + # Acquire lock to avoid doing OCR on the same document version more than + # once concurrently lock = Lock.acquire_lock(lock_id, LOCK_EXPIRE) logger.debug('acquired lock: %s', lock_id) - document = None + document_version = None try: - logger.info('Starting document OCR for document: %d', document_pk) - document = Document.objects.get(pk=document_pk) - do_document_ocr(document) + logger.info('Starting document OCR for document version: %d', document_version_pk) + document_version = DocumentVersion.objects.get(pk=document_version_pk) + do_document_ocr(document_version) except Exception as exception: - logger.error('OCR error for document: %d; %s', document_pk, exception) - document_queue = DocumentQueue.objects.get(name='default') - if document: - queue_document, created = document_queue.documents.get_or_create(document=document) - queue_document.node_name = platform.node() + logger.error('OCR error for document version: %d; %s', document_version_pk, exception) + if document_version: + entry, created = DocumentVersionOCRError.objects.get_or_create(document_version=document_version) if settings.DEBUG: result = [] type, value, tb = sys.exc_info() result.append('%s: %s' % (type.__name__, value)) result.extend(traceback.format_tb(tb)) - queue_document.result = '\n'.join(result) + entry.result = '\n'.join(result) else: - queue_document.result = exception + entry.result = exception - queue_document.save() + entry.save() else: - logger.info('OCR for document: %d ended', document_pk) - document_queue = DocumentQueue.objects.get(name='default') + logger.info('OCR for document: %d ended', document_version_pk) try: - queue_document = document_queue.documents.get(document=document) - except QueueDocument.DoesNotExist: + entry = DocumentVersionOCRError.objects.get(document_version=document_version) + except DocumentVersionOCRError.DoesNotExist: pass else: - queue_document.delete() + entry.delete() finally: lock.release() except LockError: - logger.debug('unable to obtain lock') + logger.debug('unable to obtain lock: %s' % lock_id) pass diff --git a/mayan/apps/ocr/tests.py b/mayan/apps/ocr/tests.py index a6f48f4c4a..efa0438838 100644 --- a/mayan/apps/ocr/tests.py +++ b/mayan/apps/ocr/tests.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.core.files.base import File from django.test import TransactionTestCase @@ -6,8 +6,6 @@ from django.test import TransactionTestCase from documents.models import Document, DocumentType from documents.tests import TEST_SMALL_DOCUMENT_PATH, TEST_DOCUMENT_TYPE -from .models import DocumentQueue, QueueDocument - class DocumentOCRTestCase(TransactionTestCase): def setUp(self): @@ -16,11 +14,6 @@ class DocumentOCRTestCase(TransactionTestCase): with open(TEST_SMALL_DOCUMENT_PATH) as file_object: self.document = Document.objects.new_document(file_object=File(file_object), document_type=self.document_type)[0].document - DocumentQueue.objects.get_or_create(name='default') - - # Clear OCR queue - QueueDocument.objects.all().delete() - def _test_ocr_language_issue_16(self, language, result): """ Reusable OCR test for a specific language diff --git a/mayan/apps/ocr/urls.py b/mayan/apps/ocr/urls.py index 4b4f5b8436..629f9e9888 100644 --- a/mayan/apps/ocr/urls.py +++ b/mayan/apps/ocr/urls.py @@ -1,19 +1,21 @@ +from __future__ import unicode_literals + from django.conf.urls import patterns, url -from .api_views import DocumentOCRView +from .api_views import DocumentVersionOCRView urlpatterns = patterns('ocr.views', - url(r'^document/(?P\d+)/submit/$', 'submit_document', (), 'submit_document'), - url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'), - url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'), - url(r'^queue/document/(?P\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'), - url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'), - url(r'^queue/document/(?P\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'), - url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'), + url(r'^document/(?P\d+)/submit/$', 'document_submit', (), 'document_submit'), + url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'), + url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'), - url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'), + url(r'^all/$', 'entry_list', (), 'entry_list'), + url(r'^(?P\d+)/delete/$', 'entry_delete', (), 'entry_delete'), + url(r'^multiple/delete/$', 'entry_delete_multiple', (), 'entry_delete_multiple'), + url(r'^(?P\d+)/re-queue/$', 'entry_re_queue', (), 'entry_re_queue'), + url(r'^multiple/re-queue/$', 'entry_re_queue_multiple', (), 'entry_re_queue_multiple'), ) api_urls = patterns('', - url(r'^submit/$', DocumentOCRView.as_view(), name='document-ocr-submit-view'), + url(r'^submit/$', DocumentVersionOCRView.as_view(), name='document-version-ocr-submit-view'), ) diff --git a/mayan/apps/ocr/views.py b/mayan/apps/ocr/views.py index 416fd9801e..ae4b80c1a3 100644 --- a/mayan/apps/ocr/views.py +++ b/mayan/apps/ocr/views.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals from django.contrib import messages from django.core.exceptions import PermissionDenied @@ -6,173 +6,43 @@ from django.core.urlresolvers import reverse from django.http import HttpResponseRedirect from django.shortcuts import get_object_or_404, render_to_response from django.template import RequestContext -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext_lazy as _, ungettext from acls.models import AccessEntry -from common.utils import encapsulate -from documents.models import Document -from documents.widgets import document_link, document_thumbnail +from documents.models import Document, DocumentVersion from permissions.models import Permission from .api import clean_pages -from .models import DocumentQueue, QueueDocument +from .models import DocumentVersionOCRError from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE) -def queue_document_list(request, queue_name='default'): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) - - document_queue = get_object_or_404(DocumentQueue, name=queue_name) - - context = { - 'object_list': document_queue.documents.all(), - 'title': _(u'Documents in queue: %s') % document_queue, - 'hide_object': True, - 'queue': document_queue, - 'navigation_object_name': 'queue', - 'list_object_variable_name': 'queue_document', - 'extra_columns': [ - {'name': _('Document'), 'attribute': encapsulate(lambda x: document_link(x.document) if hasattr(x, 'document') else _(u'Missing document.'))}, - {'name': _(u'Thumbnail'), 'attribute': encapsulate(lambda x: document_thumbnail(x.document))}, - {'name': _('Added'), 'attribute': encapsulate(lambda x: unicode(x.datetime_submitted).split('.')[0]), 'keep_together':True}, - {'name': _('Node'), 'attribute': 'node_name'}, - {'name': _('Result'), 'attribute': 'result'}, - ], - } - - return render_to_response('main/generic_list.html', context, - context_instance=RequestContext(request)) - - -def queue_document_delete(request, queue_document_id=None, queue_document_id_list=None): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE]) - - if queue_document_id: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)] - elif queue_document_id_list: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id) for queue_document_id in queue_document_id_list.split(',')] - else: - messages.error(request, _(u'Must provide at least one queue document.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) - - if request.method == 'POST': - for queue_document in queue_documents: - try: - queue_document.delete() - messages.success(request, _(u'Queue document: %(document)s deleted successfully.') % { - 'document': queue_document.document}) - - except Exception as exception: - messages.error(request, _(u'Error deleting document: %(document)s; %(error)s') % { - 'document': queue_document, 'error': exception}) - return HttpResponseRedirect(next) - - context = { - 'next': next, - 'previous': previous, - 'delete_view': True, - } - - if len(queue_documents) == 1: - context['object'] = queue_documents[0] - context['title'] = _(u'Are you sure you wish to delete queue document: %s?') % ', '.join([unicode(d) for d in queue_documents]) - elif len(queue_documents) > 1: - context['title'] = _(u'Are you sure you wish to delete queue documents: %s?') % ', '.join([unicode(d) for d in queue_documents]) - - return render_to_response('main/generic_confirm.html', context, - context_instance=RequestContext(request)) - - -def queue_document_multiple_delete(request): - return queue_document_delete(request, queue_document_id_list=request.GET.get('id_list', '')) - - -def submit_document_multiple(request): - for item_id in request.GET.get('id_list', '').split(','): - submit_document(request, item_id) - - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - - -def submit_document(request, document_id): - document = get_object_or_404(Document, pk=document_id) +def document_submit(request, pk): + document = get_object_or_404(Document, pk=pk) try: Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) except PermissionDenied: AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document) - return submit_document_to_queue(request, document=document, - post_submit_redirect=request.META.get('HTTP_REFERER', reverse('main:home'))) - - -def submit_document_to_queue(request, document, post_submit_redirect=None): - """ - This view is meant to be reusable - """ - document.submit_for_ocr() - messages.success(request, _(u'Document: %(document)s was added to the OCR queue.') % { + messages.success(request, _('Document: %(document)s was added to the OCR queue.') % { 'document': document} ) - if post_submit_redirect: - return HttpResponseRedirect(post_submit_redirect) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) -def re_queue_document(request, queue_document_id=None, queue_document_id_list=None): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) +def document_submit_multiple(request): + for item_id in request.GET.get('id_list', '').split(','): + document_submit(request, item_id) - if queue_document_id: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)] - elif queue_document_id_list: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id) for queue_document_id in queue_document_id_list.split(',')] - else: - messages.error(request, _(u'Must provide at least one queue document.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) - - if request.method == 'POST': - for queue_document in queue_documents: - try: - queue_document.document.submit_for_ocr() - messages.success( - request, - _(u'Document: %(document)s was re-queued for OCR.') % { - 'document': queue_document.document - } - ) - except Document.DoesNotExist: - messages.error(request, _(u'Document id#: %d, no longer exists.') % queue_document.document_id) - return HttpResponseRedirect(next) - - context = { - 'next': next, - 'previous': previous, - } - - if len(queue_documents) == 1: - context['object'] = queue_documents[0] - context['title'] = _(u'Are you sure you wish to re-queue document: %s?') % ', '.join([unicode(d) for d in queue_documents]) - elif len(queue_documents) > 1: - context['title'] = _(u'Are you sure you wish to re-queue documents: %s?') % ', '.join([unicode(d) for d in queue_documents]) - - return render_to_response('main/generic_confirm.html', context, - context_instance=RequestContext(request)) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) -def re_queue_multiple_document(request): - return re_queue_document(request, queue_document_id_list=request.GET.get('id_list', [])) - - -def all_document_ocr_cleanup(request): +def document_all_ocr_cleanup(request): Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES]) previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) @@ -182,27 +52,133 @@ def all_document_ocr_cleanup(request): return render_to_response('main/generic_confirm.html', { 'previous': previous, 'next': next, - 'title': _(u'Are you sure you wish to clean up all the pages content?'), - 'message': _(u'On large databases this operation may take some time to execute.'), + 'title': _('Are you sure you wish to clean up all the pages content?'), + 'message': _('On large databases this operation may take some time to execute.'), }, context_instance=RequestContext(request)) else: try: + # TODO: turn this into a Celery task clean_pages() - messages.success(request, _(u'Document pages content clean up complete.')) + messages.success(request, _('Document pages content clean up complete.')) except Exception as exception: - messages.error(request, _(u'Document pages content clean up error: %s') % exception) + messages.error(request, _('Document pages content clean up error: %s') % exception) return HttpResponseRedirect(next) -def display_link(obj): - output = [] - if hasattr(obj, 'get_absolute_url'): - output.append(u'%(obj)s' % { - 'url': obj.get_absolute_url(), - 'obj': obj - }) - if output: - return u''.join(output) +def entry_list(request): + Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) + + context = { + 'object_list': DocumentVersionOCRError.objects.all(), + 'title': _('OCR errors'), + 'hide_object': True, + } + + return render_to_response('main/generic_list.html', context, + context_instance=RequestContext(request)) + + +def entry_delete(request, pk=None, pk_list=None): + Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE]) + + if pk: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk)] + elif pk_list: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk) for pk in pk_list.split(',')] else: - return obj + messages.error(request, _('Make at least one selection.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) + + if request.method == 'POST': + for entry in entries: + try: + entry.delete() + messages.success(request, _('Entry: %(entry)s deleted successfully.') % { + 'entry': entry}) + + except Exception as exception: + messages.error(request, _('Error entry: %(entry)s; %(error)s') % { + 'entry': entry, 'error': exception}) + return HttpResponseRedirect(next) + + context = { + 'next': next, + 'previous': previous, + 'delete_view': True, + } + + if len(entries) == 1: + context['object'] = entries[0] + + context['title'] = ungettext( + 'Are you sure you wish to delete the entry: %(entry)s?', + 'Are you sure you wish to delete these %(count)d entries.', + len(entries) + ) % { + 'count': len(entries), + 'entry': entries[0], + } + + return render_to_response('main/generic_confirm.html', context, + context_instance=RequestContext(request)) + + +def entry_delete_multiple(request): + return entry_delete(request, pk_list=request.GET.get('id_list', '')) + + +def entry_re_queue(request, pk=None, pk_list=None): + Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) + + if pk: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk)] + elif pk_list: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk) for pk in pk_list.split(',')] + else: + messages.error(request, _('Make at least one selection.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) + + if request.method == 'POST': + for entry in entries: + try: + entry.document_version.submit_for_ocr() + messages.success( + request, + _('Entry: %(entry)s was re-queued for OCR.') % { + 'entry': entry + } + ) + except DocumentVersion.DoesNotExist: + messages.error(request, _('Document version id#: %d, no longer exists.') % entry.document_version_id) + return HttpResponseRedirect(next) + + context = { + 'next': next, + 'previous': previous, + } + + if len(entries) == 1: + context['object'] = entries[0] + + context['title'] = ungettext( + 'Are you sure you wish to re-queue the entry: %(entry)s?', + 'Are you sure you wish to re-queue these %(count)d entries.', + len(entries) + ) % { + 'count': len(entries), + 'entry': entries[0], + } + + return render_to_response('main/generic_confirm.html', context, + context_instance=RequestContext(request)) + + +def entry_re_queue_multiple(request): + return entry_re_queue(request, pk_list=request.GET.get('id_list', []))