class TextExtractor(object):
    """
    Extract the text of document pages by combining the registered text
    parsers with the configured OCR backend.
    """

    @classmethod
    def perform_ocr(cls, document_page):
        """
        Run the OCR backend named by the ``setting_ocr_backend`` setting
        against a single document page.
        """
        # The backend class is resolved from its dotted path on every call.
        backend_class = import_string(setting_ocr_backend.value)
        backend_class().process_document_page(document_page)

    @classmethod
    def process_document_page(cls, document_page):
        """
        Extract the text of one page of a document version.

        The page is handed to the parsers first. OCR is used as a fallback
        when no parser is registered for the MIME type, when parsing fails,
        or when parsing succeeds but leaves the page content empty.
        """
        try:
            Parser.parse_document_page(document_page=document_page)
        except (NoMIMETypeMatch, ParserError):
            parser_produced_text = False
        else:
            parser_produced_text = bool(document_page.ocr_content.content)

        if not parser_produced_text:
            cls.perform_ocr(document_page=document_page)

    @classmethod
    def process_document_version(cls, document_version):
        """
        Extract the text of every page of the given document version, one
        page at a time.
        """
        for page in document_version.pages.all():
            cls.process_document_page(document_page=page)
@classmethod
def parse_document_page(cls, document_page):
    """
    Parse a single document page using the parsers registered for the MIME
    type of the page's document version.

    The registered parser classes are instantiated and tried in registry
    order. The first parser whose ``process_document_page()`` call does not
    raise ``ParserError`` ends the search and the method returns ``None``.

    Raises ``NoMIMETypeMatch`` when the registry has no entry for the MIME
    type, or when every registered parser raised ``ParserError``.
    """
    mimetype = document_page.document_version.mimetype

    # Guard only the registry lookup: a KeyError raised from inside a
    # parser is a genuine error and must not be misreported as a missing
    # MIME type.
    try:
        parser_classes = cls._registry[mimetype]
    except KeyError:
        raise NoMIMETypeMatch

    for parser_class in parser_classes:
        try:
            parser = parser_class()
            parser.process_document_page(document_page)
        except ParserError:
            # If the parser raises an error, try the next parser in the
            # list, but leave a trace of the failure for debugging.
            logger.debug(
                'Parser %s failed, trying next parser in the list',
                parser_class
            )
        else:
            # If the parser was successful there is no need to try the
            # others in the list for this MIME type.
            return

    raise NoMIMETypeMatch('Parser MIME type list exhausted')
task_do_ocr(self, document_version_pk): 'Starting document OCR for document version: %s', document_version ) - backend = ocr_backend_class() - backend.process_document_version(document_version) + TextExtractor.process_document_version(document_version) except OperationalError as exception: logger.warning( 'OCR error for document version: %s; %s. Retrying.', diff --git a/mayan/apps/ocr/test_models.py b/mayan/apps/ocr/test_models.py index def2d78150..14525af663 100644 --- a/mayan/apps/ocr/test_models.py +++ b/mayan/apps/ocr/test_models.py @@ -1,4 +1,5 @@ # -*- coding: utf-8 -*- + from __future__ import unicode_literals from django.core.files.base import File @@ -19,7 +20,7 @@ class DocumentOCRTestCase(TestCase): with open(TEST_SMALL_DOCUMENT_PATH) as file_object: self.document = self.document_type.new_document( - file_object=File(file_object), label='small document' + file_object=File(file_object), ) def tearDown(self): @@ -27,8 +28,10 @@ class DocumentOCRTestCase(TestCase): self.document_type.delete() def test_ocr_language_backends_end(self): + content = self.document.pages.first().ocr_content.content + self.assertTrue( - 'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content + 'Mayan EDMS Documentation' in content ) @@ -56,9 +59,11 @@ class GermanOCRSupportTestCase(TestCase): self.document_type.delete() def test_ocr_language_backends_end(self): + content = self.document.pages.first().ocr_content.content + self.assertTrue( - 'Repository für elektronische Dokumente.' in self.document.pages.first().ocr_content.content + 'Repository für elektronische Dokumente.' 
class TextExtractorTestCase(TestCase):
    """
    Exercise TextExtractor against the hybrid sample PDF, checking the
    extracted content of its first and last pages.
    """

    def setUp(self):
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE
        )

        # Automatic OCR is switched off for the document type; presumably
        # so that text extraction only happens when the test invokes it
        # explicitly -- TODO confirm against the upload signal handlers.
        ocr_settings = self.document_type.ocr_settings
        ocr_settings.auto_ocr = False
        ocr_settings.save()

        # A PDF is binary data: open it in binary mode so the bytes are not
        # newline-translated or decoded as text on platforms/interpreters
        # where text mode differs from binary mode.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        # First page of the sample document.
        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        # Last page of the sample document.
        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )