Text parsers and OCR backends are now used in tandem for each document.

2015-08-08 04:49:08 -04:00
parent cf00ba2c40
commit bec85f38f4
9 changed files with 115 additions and 18 deletions
--- a/contrib/sample_documents/hybrid_text_and_image.pdf
+++ b/contrib/sample_documents/hybrid_text_and_image.pdf
--- a/docs/releases/2.0.rst
+++ b/docs/releases/2.0.rst
@@ -77,6 +77,7 @@ What's new in Mayan EDMS v2.0
 * Support to share an index as a FUSE filesystem.
 * Preview images' titles are clickable.
 * Improved API
+* Text parsers and the OCR backend are used in tandem.

 Upgrading from a previous version
 =================================
--- a/mayan/apps/documents/test_models.py
+++ b/mayan/apps/documents/test_models.py
@@ -48,6 +48,10 @@ TEST_COMPRESSED_DOCUMENT_PATH = os.path.join(
 )
 TEST_DOCUMENT_DESCRIPTION = 'test description'
 TEST_DOCUMENT_TYPE = 'test_document_type'
+TEST_HYBRID_DOCUMENT = 'hybrid_text_and_image.pdf'
+TEST_HYBRID_DOCUMENT_PATH = os.path.join(
+    'contrib', 'sample_documents', TEST_HYBRID_DOCUMENT
+)


 class DocumentTestCase(TestCase):
--- a/mayan/apps/ocr/classes.py
+++ b/mayan/apps/ocr/classes.py
@@ -2,13 +2,47 @@ from __future__ import unicode_literals

 import logging

+from django.utils.module_loading import import_string
+
 from converter import converter_class

+from .exceptions import NoMIMETypeMatch, ParserError
 from .models import DocumentPageContent
+from .parsers import Parser
+from .settings import setting_ocr_backend

 logger = logging.getLogger(__name__)


+class TextExtractor(object):
+    @classmethod
+    def perform_ocr(cls, document_page):
+        ocr_backend_class = import_string(setting_ocr_backend.value)
+        backend = ocr_backend_class()
+        backend.process_document_page(document_page)
+
+    @classmethod
+    def process_document_page(cls, document_page):
+        """
+        Extract text for a document version's page. Try parsing the page and if
+        there are no parsers for the MIME type, or the parsers fail or return
+        nothing, fall back to doing an OCR of the page.
+        """
+
+        try:
+            Parser.parse_document_page(document_page=document_page)
+        except (NoMIMETypeMatch, ParserError):
+            cls.perform_ocr(document_page=document_page)
+        else:
+            if not document_page.ocr_content.content:
+                cls.perform_ocr(document_page=document_page)
+
+    @classmethod
+    def process_document_version(cls, document_version):
+        for document_page in document_version.pages.all():
+            cls.process_document_page(document_page=document_page)
+
+
 class OCRBackendBase(object):
    def process_document_version(self, document_version):
        logger.info('Starting OCR for document version: %s', document_version)
@@ -17,9 +51,9 @@ class OCRBackendBase(object):
        language = document_version.document.language

        for document_page in document_version.pages.all():
-            self.process_document_page(document_page=document_page, language=language)
+            self.process_document_page(document_page=document_page)

-    def process_document_page(self, document_page, language=None):
+    def process_document_page(self, document_page):
            logger.info(
                'Processing page: %d of document version: %s',
                document_page.page_number, document_page.document_version
@@ -32,7 +66,7 @@ class OCRBackendBase(object):
                    document_page=document_page
                )
                document_page_content.content = self.execute(
-                    file_object=image, language=language
+                    file_object=image, language=document_page.document.language
                )
                document_page_content.save()
            finally:
--- a/mayan/apps/ocr/parsers.py
+++ b/mayan/apps/ocr/parsers.py
@@ -38,7 +38,7 @@ class Parser(object):
                ).append(parser_class)

    @classmethod
-    def process_document_version(cls, document_version):
+    def parse_document_version(cls, document_version):
        try:
            for parser_class in cls._registry[document_version.mimetype]:
                try:
@@ -56,6 +56,24 @@ class Parser(object):
        except KeyError:
            raise NoMIMETypeMatch

+    @classmethod
+    def parse_document_page(cls, document_page):
+        try:
+            for parser_class in cls._registry[document_page.document_version.mimetype]:
+                try:
+                    parser = parser_class()
+                    parser.process_document_page(document_page)
+                except ParserError:
+                    # If parser raises error, try next parser in the list
+                    pass
+                else:
+                    # If parser was successful there is no need to try
+                    # others in the list for this mimetype
+                    return
+            raise NoMIMETypeMatch('Parser MIME type list exhausted')
+        except KeyError:
+            raise NoMIMETypeMatch
+
    def process_document_version(self, document_version):
        logger.info('Starting parsing for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)
@@ -139,10 +157,14 @@ class PopplerParser(Parser):
            raise ParserError

        output = proc.stdout.read()
+
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

+        if output[-3:] == b'\x0a\x0a\x0c':
+            return output[:-3]
+
        return output


--- a/mayan/apps/ocr/runtime.py
+++ b/mayan/apps/ocr/runtime.py
@@ -1,5 +0,0 @@
-from django.utils.module_loading import import_string
-
-from .settings import setting_ocr_backend
-
-ocr_backend_class = import_string(setting_ocr_backend.value)
--- a/mayan/apps/ocr/tasks.py
+++ b/mayan/apps/ocr/tasks.py
@@ -11,7 +11,7 @@ from documents.models import DocumentVersion
 from lock_manager import Lock, LockError
 from mayan.celery import app

-from .runtime import ocr_backend_class
+from .classes import TextExtractor
 from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
 from .models import DocumentVersionOCRError
 from .signals import post_document_version_ocr
@@ -35,8 +35,7 @@ def task_do_ocr(self, document_version_pk):
                'Starting document OCR for document version: %s',
                document_version
            )
-            backend = ocr_backend_class()
-            backend.process_document_version(document_version)
+            TextExtractor.process_document_version(document_version)
        except OperationalError as exception:
            logger.warning(
                'OCR error for document version: %s; %s. Retrying.',
--- a/mayan/apps/ocr/test_models.py
+++ b/mayan/apps/ocr/test_models.py
@@ -1,4 +1,5 @@
 # -*- coding: utf-8 -*-
+
 from __future__ import unicode_literals

 from django.core.files.base import File
@@ -19,7 +20,7 @@ class DocumentOCRTestCase(TestCase):

        with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
            self.document = self.document_type.new_document(
-                file_object=File(file_object), label='small document'
+                file_object=File(file_object),
            )

    def tearDown(self):
@@ -27,8 +28,10 @@ class DocumentOCRTestCase(TestCase):
        self.document_type.delete()

    def test_ocr_language_backends_end(self):
+        content = self.document.pages.first().ocr_content.content
+
        self.assertTrue(
-            'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
+            'Mayan EDMS Documentation' in content
        )


@@ -56,9 +59,11 @@ class GermanOCRSupportTestCase(TestCase):
        self.document_type.delete()

    def test_ocr_language_backends_end(self):
+        content = self.document.pages.first().ocr_content.content
+
        self.assertTrue(
-            'Repository für elektronische Dokumente.' in self.document.pages.first().ocr_content.content
+            'Repository für elektronische Dokumente.' in content
        )
        self.assertTrue(
-            'Es bietet einen elektronischen Tresor oder' in self.document.pages.first().ocr_content.content
+            'Es bietet einen elektronischen Tresor oder' in content
        )
--- a/mayan/apps/ocr/test_parsers.py
+++ b/mayan/apps/ocr/test_parsers.py
@@ -1,12 +1,14 @@
-# -*- coding: utf-8 -*-
 from __future__ import unicode_literals

 from django.core.files.base import File
 from django.test import TestCase

 from documents.models import DocumentType
-from documents.test_models import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE
+from documents.test_models import (
+    TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE, TEST_HYBRID_DOCUMENT_PATH
+)

+from .classes import TextExtractor
 from .parsers import PDFMinerParser, PopplerParser


@@ -46,3 +48,38 @@ class ParserTestCase(TestCase):
        self.assertTrue(
            'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
        )
+
+
+class TextExtractorTestCase(TestCase):
+    def setUp(self):
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE
+        )
+
+        ocr_settings = self.document_type.ocr_settings
+        ocr_settings.auto_ocr = False
+        ocr_settings.save()
+
+        with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document.delete()
+        self.document_type.delete()
+
+    def test_text_extractor(self):
+        TextExtractor.process_document_version(
+            document_version=self.document.latest_version
+        )
+
+        self.assertEqual(
+            self.document.latest_version.pages.first().ocr_content.content,
+            'Sample text',
+        )
+
+        self.assertEqual(
+            self.document.latest_version.pages.last().ocr_content.content,
+            'Sample text in image form',
+        )