diff --git a/mayan/apps/ocr/admin.py b/mayan/apps/ocr/admin.py index 1bb19bf3ac..865481d602 100644 --- a/mayan/apps/ocr/admin.py +++ b/mayan/apps/ocr/admin.py @@ -3,12 +3,12 @@ from __future__ import unicode_literals from django.contrib import admin from .models import ( - DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError + DocumentPageOCRContent, DocumentTypeSettings, DocumentVersionOCRError ) -@admin.register(DocumentPageContent) -class DocumentPageContentAdmin(admin.ModelAdmin): +@admin.register(DocumentPageOCRContent) +class DocumentPageOCRContentAdmin(admin.ModelAdmin): list_display = ('document_page',) diff --git a/mayan/apps/ocr/api_views.py b/mayan/apps/ocr/api_views.py index ded56e8ed8..c3a4e20f6c 100644 --- a/mayan/apps/ocr/api_views.py +++ b/mayan/apps/ocr/api_views.py @@ -6,9 +6,9 @@ from rest_framework.response import Response from documents.models import Document, DocumentPage, DocumentVersion from rest_api.permissions import MayanPermission -from .models import DocumentPageContent +from .models import DocumentPageOCRContent from .permissions import permission_ocr_content_view, permission_ocr_document -from .serializers import DocumentPageContentSerializer +from .serializers import DocumentPageOCRContentSerializer class APIDocumentOCRView(generics.GenericAPIView): @@ -67,7 +67,7 @@ class APIDocumentVersionOCRView(generics.GenericAPIView): return Response(status=status.HTTP_202_ACCEPTED) -class APIDocumentPageContentView(generics.RetrieveAPIView): +class APIDocumentPageOCRContentView(generics.RetrieveAPIView): """ Returns the OCR content of the selected document page. 
--- @@ -82,7 +82,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView): 'GET': (permission_ocr_content_view,), } permission_classes = (MayanPermission,) - serializer_class = DocumentPageContentSerializer + serializer_class = DocumentPageOCRContentSerializer queryset = DocumentPage.objects.all() def retrieve(self, request, *args, **kwargs): @@ -90,8 +90,8 @@ class APIDocumentPageContentView(generics.RetrieveAPIView): try: ocr_content = instance.ocr_content - except DocumentPageContent.DoesNotExist: - ocr_content = DocumentPageContent.objects.none() + except DocumentPageOCRContent.DoesNotExist: + ocr_content = DocumentPageOCRContent.objects.none() serializer = self.get_serializer(ocr_content) return Response(serializer.data) diff --git a/mayan/apps/ocr/apps.py b/mayan/apps/ocr/apps.py index a9bccc8fe4..9ba0171b56 100644 --- a/mayan/apps/ocr/apps.py +++ b/mayan/apps/ocr/apps.py @@ -1,11 +1,13 @@ from __future__ import unicode_literals +from datetime import timedelta import logging from kombu import Exchange, Queue from django.apps import apps from django.db.models.signals import post_save +from django.utils.timezone import now from django.utils.translation import ugettext_lazy as _ from acls import ModelPermission @@ -21,7 +23,10 @@ from mayan.celery import app from navigation import SourceColumn from rest_api.classes import APIEndPoint -from .handlers import initialize_new_ocr_settings, post_version_upload_ocr +from .events import event_ocr_document_version_submit +from .handlers import ( + handler_initialize_new_ocr_settings, handler_ocr_document_version, +) from .links import ( link_document_content, link_document_ocr_download, link_document_ocr_erros_list, link_document_submit, @@ -36,17 +41,17 @@ logger = logging.getLogger(__name__) def document_ocr_submit(self): - from .tasks import task_do_ocr - - task_do_ocr.apply_async(args=(self.latest_version.pk,)) + self.latest_version.submit_for_ocr() def document_version_ocr_submit(self): from .tasks import 
task_do_ocr + event_ocr_document_version_submit.commit(target=self) + task_do_ocr.apply_async( + eta=now() + timedelta(seconds=settings_db_sync_task_delay.value), kwargs={'document_version_pk': self.pk}, - countdown=settings_db_sync_task_delay.value ) @@ -155,10 +160,12 @@ class OCRApp(MayanAppConfig): ) post_save.connect( - initialize_new_ocr_settings, - dispatch_uid='initialize_new_ocr_settings', sender=DocumentType + dispatch_uid='ocr_handler_initialize_new_ocr_settings', + receiver=handler_initialize_new_ocr_settings, + sender=DocumentType ) post_version_upload.connect( - post_version_upload_ocr, dispatch_uid='post_version_upload_ocr', + dispatch_uid='ocr_handler_ocr_document_version', + receiver=handler_ocr_document_version, sender=DocumentVersion ) diff --git a/mayan/apps/ocr/classes.py b/mayan/apps/ocr/classes.py index 91e1561e5c..337098bff2 100644 --- a/mayan/apps/ocr/classes.py +++ b/mayan/apps/ocr/classes.py @@ -1,80 +1,9 @@ from __future__ import unicode_literals -import logging - -from django.utils.module_loading import import_string - from converter import converter_class -from documents.runtime import cache_storage_backend - -from .exceptions import NoMIMETypeMatch, ParserError -from .models import DocumentPageContent -from .parsers import Parser -from .settings import setting_ocr_backend - -logger = logging.getLogger(__name__) - - -class TextExtractor(object): - @classmethod - def perform_ocr(cls, document_page): - ocr_backend_class = import_string(setting_ocr_backend.value) - backend = ocr_backend_class() - backend.process_document_page(document_page) - - @classmethod - def process_document_page(cls, document_page): - """ - Extract text for a document version's page. Try parsing the page and if - no there are not parsers for the MIME type or the parser return nothing - fallback to doing and OCR of the page. 
- """ - - try: - Parser.parse_document_page(document_page=document_page) - except (NoMIMETypeMatch, ParserError): - cls.perform_ocr(document_page=document_page) - else: - if not document_page.ocr_content.content: - cls.perform_ocr(document_page=document_page) - - @classmethod - def process_document_version(cls, document_version): - for document_page in document_version.pages.all(): - cls.process_document_page(document_page=document_page) class OCRBackendBase(object): - def process_document_version(self, document_version): - logger.info('Starting OCR for document version: %s', document_version) - logger.debug('document version: %d', document_version.pk) - - for document_page in document_version.pages.all(): - self.process_document_page(document_page=document_page) - - def process_document_page(self, document_page): - logger.info( - 'Processing page: %d of document version: %s', - document_page.page_number, document_page.document_version - ) - - cache_filename = document_page.generate_image() - - with cache_storage_backend.open(cache_filename) as file_object: - document_page_content, created = DocumentPageContent.objects.get_or_create( - document_page=document_page - ) - document_page_content.content = self.execute( - file_object=file_object, - language=document_page.document.language - ) - document_page_content.save() - - logger.info( - 'Finished processing page: %d of document version: %s', - document_page.page_number, document_page.document_version - ) - def execute(self, file_object, language=None, transformations=None): self.language = language diff --git a/mayan/apps/ocr/events.py b/mayan/apps/ocr/events.py new file mode 100644 index 0000000000..ac330df821 --- /dev/null +++ b/mayan/apps/ocr/events.py @@ -0,0 +1,14 @@ +from __future__ import absolute_import, unicode_literals + +from django.utils.translation import ugettext_lazy as _ + +from events.classes import Event + +event_ocr_document_version_submit = Event( + name='ocr_document_version_submit', + 
label=_('Document version submitted for OCR') +) +event_ocr_document_version_finish = Event( + name='ocr_document_version_finish', + label=_('Document version OCR finished') +) diff --git a/mayan/apps/ocr/exceptions.py b/mayan/apps/ocr/exceptions.py index 9fc7a9b90a..686aa75914 100644 --- a/mayan/apps/ocr/exceptions.py +++ b/mayan/apps/ocr/exceptions.py @@ -6,17 +6,3 @@ class OCRError(Exception): Raised by the OCR backend """ pass - - -class ParserError(Exception): - """ - Base exception for file parsers - """ - pass - - -class NoMIMETypeMatch(ParserError): - """ - There is no parser registered for the specified MIME type - """ - pass diff --git a/mayan/apps/ocr/forms.py b/mayan/apps/ocr/forms.py index 8d1729138c..723912e0b2 100644 --- a/mayan/apps/ocr/forms.py +++ b/mayan/apps/ocr/forms.py @@ -9,17 +9,17 @@ from django.utils.translation import ugettext_lazy as _, ugettext from common.widgets import TextAreaDiv from documents.models import DocumentType -from .models import DocumentPageContent +from .models import DocumentPageOCRContent -class DocumentContentForm(forms.Form): +class DocumentOCRContentForm(forms.Form): """ Form that concatenates all of a document pages' text content into a single textarea widget """ def __init__(self, *args, **kwargs): self.document = kwargs.pop('instance', None) - super(DocumentContentForm, self).__init__(*args, **kwargs) + super(DocumentOCRContentForm, self).__init__(*args, **kwargs) content = [] self.fields['contents'].initial = '' try: @@ -30,7 +30,7 @@ class DocumentContentForm(forms.Form): for page in document_pages: try: page_content = page.ocr_content.content - except DocumentPageContent.DoesNotExist: + except DocumentPageOCRContent.DoesNotExist: pass else: content.append(conditional_escape(force_text(page_content))) diff --git a/mayan/apps/ocr/handlers.py b/mayan/apps/ocr/handlers.py index e8ea9fee90..9d03dfe3d6 100644 --- a/mayan/apps/ocr/handlers.py +++ b/mayan/apps/ocr/handlers.py @@ -9,14 +9,7 @@ from .settings import 
setting_auto_ocr logger = logging.getLogger(__name__) -def post_version_upload_ocr(sender, instance, **kwargs): - logger.debug('received post_version_upload') - logger.debug('instance pk: %s', instance.pk) - if instance.document.document_type.ocr_settings.auto_ocr: - instance.submit_for_ocr() - - -def initialize_new_ocr_settings(sender, instance, **kwargs): +def handler_initialize_new_ocr_settings(sender, instance, **kwargs): DocumentTypeSettings = apps.get_model( app_label='ocr', model_name='DocumentTypeSettings' ) @@ -25,3 +18,10 @@ def initialize_new_ocr_settings(sender, instance, **kwargs): DocumentTypeSettings.objects.create( document_type=instance, auto_ocr=setting_auto_ocr.value ) + + +def handler_ocr_document_version(sender, instance, **kwargs): + logger.debug('received post_version_upload') + logger.debug('instance pk: %s', instance.pk) + if instance.document.document_type.ocr_settings.auto_ocr: + instance.submit_for_ocr() diff --git a/mayan/apps/ocr/managers.py b/mayan/apps/ocr/managers.py new file mode 100644 index 0000000000..4f88928ca2 --- /dev/null +++ b/mayan/apps/ocr/managers.py @@ -0,0 +1,82 @@ +from __future__ import unicode_literals + +import logging +import sys +import traceback + +from django.apps import apps +from django.conf import settings +from django.db import models + +from documents.runtime import cache_storage_backend + +from .events import event_ocr_document_version_finish +from .runtime import ocr_backend +from .signals import post_document_version_ocr + +logger = logging.getLogger(__name__) + + +class DocumentPageOCRContentManager(models.Manager): + def process_document_version(self, document_version): + logger.info('Starting OCR for document version: %s', document_version) + logger.debug('document version: %d', document_version.pk) + + try: + for document_page in document_version.pages.all(): + self.process_document_page(document_page=document_page) + except Exception as exception: + logger.error( + 'OCR error for document version: 
%d; %s', document_version.pk, + exception + ) + + if settings.DEBUG: + result = [] + type, value, tb = sys.exc_info() + result.append('%s: %s' % (type.__name__, value)) + result.extend(traceback.format_tb(tb)) + document_version.ocr_errors.create( + result='\n'.join(result) + ) + else: + document_version.ocr_errors.create(result=exception) + else: + logger.info( + 'OCR complete for document version: %s', document_version + ) + document_version.ocr_errors.all().delete() + + event_ocr_document_version_finish.commit(target=document_version) + + post_document_version_ocr.send( + sender=document_version.__class__, instance=document_version + ) + + def process_document_page(self, document_page): + logger.info( + 'Processing page: %d of document version: %s', + document_page.page_number, document_page.document_version + ) + + DocumentPageOCRContent = apps.get_model( + app_label='ocr', model_name='DocumentPageOCRContent' + ) + + # TODO: Call task and wait + cache_filename = document_page.generate_image() + + with cache_storage_backend.open(cache_filename) as file_object: + document_page_content, created = DocumentPageOCRContent.objects.get_or_create( + document_page=document_page + ) + document_page_content.content = ocr_backend.execute( + file_object=file_object, + language=document_page.document.language + ) + document_page_content.save() + + logger.info( + 'Finished processing page: %d of document version: %s', + document_page.page_number, document_page.document_version + ) diff --git a/mayan/apps/ocr/migrations/0006_auto_20170823_0553.py b/mayan/apps/ocr/migrations/0006_auto_20170823_0553.py new file mode 100644 index 0000000000..19966f4448 --- /dev/null +++ b/mayan/apps/ocr/migrations/0006_auto_20170823_0553.py @@ -0,0 +1,67 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.10.7 on 2017-08-23 05:53 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + 
dependencies = [ + ('documents', '0040_auto_20170725_1111'), + ('ocr', '0005_auto_20170630_1846'), + ] + + operations = [ + migrations.CreateModel( + name='DocumentPageOCRContent', + fields=[ + ( + 'id', models.AutoField( + auto_created=True, primary_key=True, serialize=False, + verbose_name='ID' + ) + ), + ( + 'content', models.TextField( + blank=True, verbose_name='Content' + ) + ), + ( + 'document_page', models.OneToOneField( + on_delete=django.db.models.deletion.CASCADE, + related_name='ocr', to='documents.DocumentPage', + verbose_name='Document page' + ) + ), + ], + options={ + 'verbose_name': 'Document page OCR content', + 'verbose_name_plural': 'Document pages OCR contents', + }, + ), + migrations.RemoveField( + model_name='documentpagecontent', + name='document_page', + ), + migrations.AlterModelOptions( + name='documentversionocrerror', + options={ + 'ordering': ('datetime_submitted',), + 'verbose_name': 'Document version OCR error', + 'verbose_name_plural': 'Document version OCR errors' + }, + ), + migrations.AlterField( + model_name='documentversionocrerror', + name='datetime_submitted', + field=models.DateTimeField( + auto_now_add=True, db_index=True, + verbose_name='Date time submitted' + ), + ), + migrations.DeleteModel( + name='DocumentPageContent', + ), + ] diff --git a/mayan/apps/ocr/models.py b/mayan/apps/ocr/models.py index d8b35691f5..bf8982a9f1 100644 --- a/mayan/apps/ocr/models.py +++ b/mayan/apps/ocr/models.py @@ -6,6 +6,8 @@ from django.utils.translation import ugettext_lazy as _ from documents.models import DocumentPage, DocumentType, DocumentVersion +from .managers import DocumentPageOCRContentManager + class DocumentTypeSettings(models.Model): """ @@ -25,6 +27,24 @@ class DocumentTypeSettings(models.Model): verbose_name_plural = _('Document types settings') +@python_2_unicode_compatible +class DocumentPageOCRContent(models.Model): + document_page = models.OneToOneField( + DocumentPage, on_delete=models.CASCADE, related_name='ocr_content', 
+ verbose_name=_('Document page') + ) + content = models.TextField(blank=True, verbose_name=_('Content')) + + objects = DocumentPageOCRContentManager() + + def __str__(self): + return force_text(self.document_page) + + class Meta: + verbose_name = _('Document page OCR content') + verbose_name_plural = _('Document pages OCR contents') + + @python_2_unicode_compatible class DocumentVersionOCRError(models.Model): document_version = models.ForeignKey( @@ -32,7 +52,7 @@ class DocumentVersionOCRError(models.Model): verbose_name=_('Document version') ) datetime_submitted = models.DateTimeField( - auto_now=True, db_index=True, verbose_name=_('Date time submitted') + auto_now_add=True, db_index=True, verbose_name=_('Date time submitted') ) result = models.TextField(blank=True, null=True, verbose_name=_('Result')) @@ -41,24 +61,5 @@ class DocumentVersionOCRError(models.Model): class Meta: ordering = ('datetime_submitted',) - verbose_name = _('Document Version OCR Error') - verbose_name_plural = _('Document Version OCR Errors') - - -@python_2_unicode_compatible -class DocumentPageContent(models.Model): - """ - Model that describes a document page content - """ - document_page = models.OneToOneField( - DocumentPage, on_delete=models.CASCADE, related_name='ocr_content', - verbose_name=_('Document page') - ) - content = models.TextField(blank=True, verbose_name=_('Content')) - - def __str__(self): - return force_text(self.document_page) - - class Meta: - verbose_name = _('Document page content') - verbose_name_plural = _('Document pages contents') + verbose_name = _('Document version OCR error') + verbose_name_plural = _('Document version OCR errors') diff --git a/mayan/apps/ocr/parsers.py b/mayan/apps/ocr/parsers.py deleted file mode 100644 index 87570afa1f..0000000000 --- a/mayan/apps/ocr/parsers.py +++ /dev/null @@ -1,202 +0,0 @@ -from __future__ import unicode_literals - -from io import BytesIO -import logging -import os -from pdfminer.pdfinterp import PDFResourceManager, 
PDFPageInterpreter -from pdfminer.pdfpage import PDFPage -from pdfminer.converter import TextConverter -from pdfminer.layout import LAParams -import subprocess - -from django.utils.translation import ugettext_lazy as _ - -from common.utils import copyfile, fs_cleanup, mkstemp - -from .exceptions import ParserError, NoMIMETypeMatch -from .models import DocumentPageContent -from .settings import setting_pdftotext_path - -logger = logging.getLogger(__name__) - - -class Parser(object): - """ - Parser base class - """ - - _registry = {} - - @classmethod - def register(cls, mimetypes, parser_classes): - for mimetype in mimetypes: - for parser_class in parser_classes: - cls._registry.setdefault( - mimetype, [] - ).append(parser_class) - - @classmethod - def parse_document_version(cls, document_version): - try: - for parser_class in cls._registry[document_version.mimetype]: - try: - parser = parser_class() - parser.process_document_version(document_version) - except ParserError: - # If parser raises error, try next parser in the list - pass - else: - # If parser was successfull there is no need to try - # others in the list for this mimetype - return - - raise NoMIMETypeMatch('Parser MIME type list exhausted') - except KeyError: - raise NoMIMETypeMatch - - @classmethod - def parse_document_page(cls, document_page): - try: - for parser_class in cls._registry[document_page.document_version.mimetype]: - try: - parser = parser_class() - parser.process_document_page(document_page) - except ParserError: - # If parser raises error, try next parser in the list - pass - else: - # If parser was successfull there is no need to try - # others in the list for this mimetype - return - raise NoMIMETypeMatch('Parser MIME type list exhausted') - except KeyError: - raise NoMIMETypeMatch - - def process_document_version(self, document_version): - logger.info( - 'Starting parsing for document version: %s', document_version - ) - logger.debug('document version: %d', document_version.pk) - - 
for document_page in document_version.pages.all(): - self.process_document_page(document_page=document_page) - - def process_document_page(self, document_page): - logger.info( - 'Processing page: %d of document version: %s', - document_page.page_number, document_page.document_version - ) - - file_object = document_page.document_version.get_intermidiate_file() - - try: - document_page_content, created = DocumentPageContent.objects.get_or_create( - document_page=document_page - ) - document_page_content.content = self.execute( - file_object=file_object, page_number=document_page.page_number - ) - document_page_content.save() - except Exception as exception: - error_message = _('Exception parsing page; %s') % exception - logger.error(error_message) - raise ParserError(error_message) - finally: - file_object.close() - - logger.info( - 'Finished processing page: %d of document version: %s', - document_page.page_number, document_page.document_version - ) - - def execute(self, file_object, page_number): - raise NotImplementedError( - 'Your %s class has not defined the required execute() method.' 
% - self.__class__.__name__ - ) - - -class PopplerParser(Parser): - """ - PDF parser using the pdftotext execute from the poppler package - """ - - def __init__(self): - self.pdftotext_path = setting_pdftotext_path.value - if not os.path.exists(self.pdftotext_path): - error_message = _( - 'Cannot find pdftotext executable at: %s' - ) % self.pdftotext_path - logger.error(error_message) - raise ParserError(error_message) - - logger.debug('self.pdftotext_path: %s', self.pdftotext_path) - - def execute(self, file_object, page_number): - logger.debug('Parsing PDF page: %d', page_number) - - destination_descriptor, temp_filepath = mkstemp() - copyfile(file_object, temp_filepath) - - command = [] - command.append(self.pdftotext_path) - command.append('-f') - command.append(str(page_number)) - command.append('-l') - command.append(str(page_number)) - command.append(temp_filepath) - command.append('-') - - proc = subprocess.Popen( - command, close_fds=True, stderr=subprocess.PIPE, - stdout=subprocess.PIPE - ) - return_code = proc.wait() - if return_code != 0: - logger.error(proc.stderr.readline()) - fs_cleanup(temp_filepath, file_descriptor=destination_descriptor) - - raise ParserError - - output = proc.stdout.read() - fs_cleanup(temp_filepath, file_descriptor=destination_descriptor) - - if output == b'\x0c': - logger.debug('Parser didn\'t return any output') - return '' - - if output[-3:] == b'\x0a\x0a\x0c': - return output[:-3] - - return output - - -class PDFMinerParser(Parser): - """ - Parser for PDF files using the PDFMiner library for Python - """ - - def execute(self, file_object, page_number): - logger.debug('Parsing PDF page: %d', page_number) - - with BytesIO() as string_buffer: - rsrcmgr = PDFResourceManager() - device = TextConverter( - rsrcmgr, outfp=string_buffer, laparams=LAParams() - ) - interpreter = PDFPageInterpreter(rsrcmgr, device) - page = PDFPage.get_pages( - file_object, maxpages=1, pagenos=(page_number - 1,) - ) - 
interpreter.process_page(page.next()) - device.close() - - logger.debug('Finished parsing PDF: %d', page_number) - - return string_buffer.getvalue() - - -Parser.register( - mimetypes=('application/pdf',), - parser_classes=(PopplerParser, PDFMinerParser) -) diff --git a/mayan/apps/ocr/runtime.py b/mayan/apps/ocr/runtime.py new file mode 100644 index 0000000000..307f2a6a40 --- /dev/null +++ b/mayan/apps/ocr/runtime.py @@ -0,0 +1,5 @@ +from django.utils.module_loading import import_string + +from .settings import setting_ocr_backend + +ocr_backend = import_string(setting_ocr_backend.value)() diff --git a/mayan/apps/ocr/serializers.py b/mayan/apps/ocr/serializers.py index 7161d2fc40..3d9c06c18d 100644 --- a/mayan/apps/ocr/serializers.py +++ b/mayan/apps/ocr/serializers.py @@ -2,10 +2,10 @@ from __future__ import unicode_literals from rest_framework import serializers -from .models import DocumentPageContent +from .models import DocumentPageOCRContent -class DocumentPageContentSerializer(serializers.ModelSerializer): +class DocumentPageOCRContentSerializer(serializers.ModelSerializer): class Meta: fields = ('content',) - model = DocumentPageContent + model = DocumentPageOCRContent diff --git a/mayan/apps/ocr/tasks.py b/mayan/apps/ocr/tasks.py index 97a69a56a5..87294e5925 100644 --- a/mayan/apps/ocr/tasks.py +++ b/mayan/apps/ocr/tasks.py @@ -1,84 +1,53 @@ from __future__ import unicode_literals import logging -import sys -import traceback -from django.conf import settings +from django.apps import apps from django.db import OperationalError -from documents.models import DocumentVersion from lock_manager import LockError from lock_manager.runtime import locking_backend from mayan.celery import app -from .classes import TextExtractor from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE -from .models import DocumentVersionOCRError -from .signals import post_document_version_ocr logger = logging.getLogger(__name__) @app.task(bind=True, default_retry_delay=DO_OCR_RETRY_DELAY, 
ignore_result=True) def task_do_ocr(self, document_version_pk): + DocumentVersion = apps.get_model( + app_label='documents', model_name='DocumentVersion' + ) + DocumentPageOCRContent = apps.get_model( + app_label='ocr', model_name='DocumentPageOCRContent' + ) + lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk try: logger.debug('trying to acquire lock: %s', lock_id) - # Acquire lock to avoid doing OCR on the same document version more than - # once concurrently + # Acquire lock to avoid doing OCR on the same document version more + # than once concurrently lock = locking_backend.acquire_lock(lock_id, LOCK_EXPIRE) logger.debug('acquired lock: %s', lock_id) document_version = None try: - document_version = DocumentVersion.objects.get(pk=document_version_pk) + document_version = DocumentVersion.objects.get( + pk=document_version_pk + ) logger.info( 'Starting document OCR for document version: %s', document_version ) - TextExtractor.process_document_version(document_version) + DocumentPageOCRContent.objects.process_document_version( + document_version=document_version + ) except OperationalError as exception: logger.warning( 'OCR error for document version: %d; %s. 
Retrying.', document_version_pk, exception ) raise self.retry(exc=exception) - except Exception as exception: - logger.error( - 'OCR error for document version: %d; %s', document_version_pk, - exception - ) - if document_version: - entry, created = DocumentVersionOCRError.objects.get_or_create( - document_version=document_version - ) - - if settings.DEBUG: - result = [] - type, value, tb = sys.exc_info() - result.append('%s: %s' % (type.__name__, value)) - result.extend(traceback.format_tb(tb)) - entry.result = '\n'.join(result) - else: - entry.result = exception - - entry.save() - else: - logger.info( - 'OCR complete for document version: %s', document_version - ) - try: - entry = DocumentVersionOCRError.objects.get( - document_version=document_version - ) - except DocumentVersionOCRError.DoesNotExist: - pass - else: - entry.delete() - - post_document_version_ocr.send( - sender=self, instance=document_version - ) finally: lock.release() except LockError: diff --git a/mayan/apps/ocr/tests/test_events.py b/mayan/apps/ocr/tests/test_events.py new file mode 100644 index 0000000000..9d302df8f0 --- /dev/null +++ b/mayan/apps/ocr/tests/test_events.py @@ -0,0 +1,35 @@ +from __future__ import unicode_literals + +from actstream.models import Action + +from documents.tests.test_models import GenericDocumentTestCase + +from ..events import ( + event_ocr_document_version_submit, event_ocr_document_version_finish +) + + +class OCREventsTestCase(GenericDocumentTestCase): + def test_document_version_submit_event(self): + Action.objects.all().delete() + self.document.submit_for_ocr() + + self.assertEqual( + Action.objects.last().target, self.document.latest_version + ) + self.assertEqual( + Action.objects.last().verb, + event_ocr_document_version_submit.name + ) + + def test_document_version_finish_event(self): + Action.objects.all().delete() + self.document.submit_for_ocr() + + self.assertEqual( + Action.objects.first().target, self.document.latest_version + ) + self.assertEqual( 
+ Action.objects.first().verb, + event_ocr_document_version_finish.name + ) diff --git a/mayan/apps/ocr/tests/test_parsers.py b/mayan/apps/ocr/tests/test_parsers.py deleted file mode 100644 index 9d500a572a..0000000000 --- a/mayan/apps/ocr/tests/test_parsers.py +++ /dev/null @@ -1,83 +0,0 @@ -from __future__ import unicode_literals - -from django.core.files.base import File -from django.test import override_settings - -from common.tests import BaseTestCase -from documents.models import DocumentType -from documents.tests import ( - TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH -) - -from ..classes import TextExtractor -from ..parsers import PDFMinerParser, PopplerParser - - -@override_settings(OCR_AUTO_OCR=False) -class ParserTestCase(BaseTestCase): - def setUp(self): - super(ParserTestCase, self).setUp() - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - with open(TEST_DOCUMENT_PATH) as file_object: - self.document = self.document_type.new_document( - file_object=File(file_object) - ) - - def tearDown(self): - self.document_type.delete() - super(ParserTestCase, self).tearDown() - - def test_pdfminer_parser(self): - parser = PDFMinerParser() - - parser.process_document_version(self.document.latest_version) - - self.assertTrue( - 'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content - ) - - def test_poppler_parser(self): - parser = PopplerParser() - - parser.process_document_version(self.document.latest_version) - - self.assertTrue( - 'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content - ) - - -@override_settings(OCR_AUTO_OCR=False) -class TextExtractorTestCase(BaseTestCase): - def setUp(self): - super(TextExtractorTestCase, self).setUp() - - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - with open(TEST_HYBRID_DOCUMENT_PATH) as file_object: - self.document = self.document_type.new_document( - 
file_object=File(file_object) - ) - - def tearDown(self): - self.document_type.delete() - super(TextExtractorTestCase, self).tearDown() - - def test_text_extractor(self): - TextExtractor.process_document_version( - document_version=self.document.latest_version - ) - - self.assertEqual( - self.document.latest_version.pages.first().ocr_content.content, - 'Sample text', - ) - - self.assertEqual( - self.document.latest_version.pages.last().ocr_content.content, - 'Sample text in image form', - ) diff --git a/mayan/apps/ocr/urls.py b/mayan/apps/ocr/urls.py index 30f1b59359..6638c2bc28 100644 --- a/mayan/apps/ocr/urls.py +++ b/mayan/apps/ocr/urls.py @@ -3,7 +3,8 @@ from __future__ import unicode_literals from django.conf.urls import url from .api_views import ( - APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView + APIDocumentOCRView, APIDocumentPageOCRContentView, + APIDocumentVersionOCRView ) from .views import ( DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView, @@ -59,7 +60,8 @@ api_urls = [ name='document-version-ocr-submit-view' ), url( - r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(), + r'^page/(?P<pk>\d+)/content/$', + APIDocumentPageOCRContentView.as_view(), name='document-page-content-view' ), ] diff --git a/mayan/apps/ocr/utils.py b/mayan/apps/ocr/utils.py index 8175c3040e..13724cada7 100644 --- a/mayan/apps/ocr/utils.py +++ b/mayan/apps/ocr/utils.py @@ -3,14 +3,14 @@ from __future__ import unicode_literals from django.utils.encoding import force_text from django.utils.html import conditional_escape -from .models import DocumentPageContent +from .models import DocumentPageOCRContent def get_document_ocr_content(document): for page in document.pages.all(): try: page_content = page.ocr_content.content - except DocumentPageContent.DoesNotExist: + except DocumentPageOCRContent.DoesNotExist: pass else: yield conditional_escape(force_text(page_content)) diff --git a/mayan/apps/ocr/views.py 
b/mayan/apps/ocr/views.py index fa0871735b..54c34ae2ec 100644 --- a/mayan/apps/ocr/views.py +++ b/mayan/apps/ocr/views.py @@ -14,7 +14,7 @@ from common.generics import ( from common.mixins import MultipleInstanceActionMixin from documents.models import Document, DocumentType -from .forms import DocumentContentForm, DocumentTypeSelectForm +from .forms import DocumentOCRContentForm, DocumentTypeSelectForm from .models import DocumentVersionOCRError from .permissions import ( permission_ocr_content_view, permission_ocr_document, @@ -40,6 +40,27 @@ class DocumentAllSubmitView(ConfirmView): ) +class DocumentOCRContent(SingleObjectDetailView): + form_class = DocumentOCRContentForm + model = Document + object_permission = permission_ocr_content_view + + def dispatch(self, request, *args, **kwargs): + result = super(DocumentOCRContent, self).dispatch( + request, *args, **kwargs + ) + self.get_object().add_as_recent_document_for_user(request.user) + return result + + def get_extra_context(self): + return { + 'document': self.get_object(), + 'hide_labels': True, + 'object': self.get_object(), + 'title': _('OCR result for document: %s') % self.get_object(), + } + + class DocumentSubmitView(ConfirmView): def get_extra_context(self): return { @@ -128,27 +149,6 @@ class DocumentTypeSettingsEditView(SingleObjectEditView): } -class DocumentOCRContent(SingleObjectDetailView): - form_class = DocumentContentForm - model = Document - object_permission = permission_ocr_content_view - - def dispatch(self, request, *args, **kwargs): - result = super(DocumentOCRContent, self).dispatch( - request, *args, **kwargs - ) - self.get_object().add_as_recent_document_for_user(request.user) - return result - - def get_extra_context(self): - return { - 'document': self.get_object(), - 'hide_labels': True, - 'object': self.get_object(), - 'title': _('OCR result for document: %s') % self.get_object(), - } - - class EntryListView(SingleObjectListView): extra_context = { 'hide_object': True,