Text parsers and OCR backends are now used in tandem for each document.
This commit is contained in:
BIN
contrib/sample_documents/hybrid_text_and_image.pdf
Normal file
BIN
contrib/sample_documents/hybrid_text_and_image.pdf
Normal file
Binary file not shown.
@@ -77,6 +77,7 @@ What's new in Mayan EDMS v2.0
|
||||
* Support to share an index as a FUSE filesystem.
|
||||
* Preview images' titles are clickable.
|
||||
* Improved API
|
||||
* Text parsers and the OCR backend are now used in tandem.
|
||||
|
||||
Upgrading from a previous version
|
||||
=================================
|
||||
|
||||
@@ -48,6 +48,10 @@ TEST_COMPRESSED_DOCUMENT_PATH = os.path.join(
|
||||
)
|
||||
TEST_DOCUMENT_DESCRIPTION = 'test description'
|
||||
TEST_DOCUMENT_TYPE = 'test_document_type'
|
||||
TEST_HYBRID_DOCUMENT = 'hybrid_text_and_image.pdf'
|
||||
TEST_HYBRID_DOCUMENT_PATH = os.path.join(
|
||||
'contrib', 'sample_documents', TEST_HYBRID_DOCUMENT
|
||||
)
|
||||
|
||||
|
||||
class DocumentTestCase(TestCase):
|
||||
|
||||
@@ -2,13 +2,47 @@ from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.utils.module_loading import import_string
|
||||
|
||||
from converter import converter_class
|
||||
|
||||
from .exceptions import NoMIMETypeMatch, ParserError
|
||||
from .models import DocumentPageContent
|
||||
from .parsers import Parser
|
||||
from .settings import setting_ocr_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextExtractor(object):
    """
    Coordinate text extraction for document pages by combining the
    registered text parsers with the configured OCR backend.
    """

    @classmethod
    def perform_ocr(cls, document_page):
        """
        Instantiate the OCR backend class named by ``setting_ocr_backend``
        and have it process the given document page.
        """
        backend_class = import_string(setting_ocr_backend.value)
        backend_class().process_document_page(document_page)

    @classmethod
    def process_document_page(cls, document_page):
        """
        Extract the text of a single page of a document version.

        The page is parsed first. OCR is used as a fallback when there is
        no parser for the MIME type, when parsing fails, or when parsing
        succeeds but leaves the page content empty.
        """
        try:
            Parser.parse_document_page(document_page=document_page)
        except (NoMIMETypeMatch, ParserError):
            needs_ocr = True
        else:
            # Parsing succeeded; only OCR the page if it produced no text.
            needs_ocr = not document_page.ocr_content.content

        if needs_ocr:
            cls.perform_ocr(document_page=document_page)

    @classmethod
    def process_document_version(cls, document_version):
        """
        Extract the text of every page of the given document version.
        """
        pages = document_version.pages.all()
        for page in pages:
            cls.process_document_page(document_page=page)
|
||||
|
||||
|
||||
class OCRBackendBase(object):
|
||||
def process_document_version(self, document_version):
|
||||
logger.info('Starting OCR for document version: %s', document_version)
|
||||
@@ -17,9 +51,9 @@ class OCRBackendBase(object):
|
||||
language = document_version.document.language
|
||||
|
||||
for document_page in document_version.pages.all():
|
||||
self.process_document_page(document_page=document_page, language=language)
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page, language=None):
|
||||
def process_document_page(self, document_page):
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
@@ -32,7 +66,7 @@ class OCRBackendBase(object):
|
||||
document_page=document_page
|
||||
)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=image, language=language
|
||||
file_object=image, language=document_page.document.language
|
||||
)
|
||||
document_page_content.save()
|
||||
finally:
|
||||
|
||||
@@ -38,7 +38,7 @@ class Parser(object):
|
||||
).append(parser_class)
|
||||
|
||||
@classmethod
|
||||
def process_document_version(cls, document_version):
|
||||
def parse_document_version(cls, document_version):
|
||||
try:
|
||||
for parser_class in cls._registry[document_version.mimetype]:
|
||||
try:
|
||||
@@ -56,6 +56,24 @@ class Parser(object):
|
||||
except KeyError:
|
||||
raise NoMIMETypeMatch
|
||||
|
||||
@classmethod
def parse_document_page(cls, document_page):
    """
    Parse a single document page with the parsers registered for the MIME
    type of its document version.

    Parsers are tried in registry order and the first one that does not
    raise ``ParserError`` wins. ``NoMIMETypeMatch`` is raised when every
    registered parser fails or when a ``KeyError`` occurs, which includes
    the MIME type not being present in the registry.
    """
    try:
        for candidate in cls._registry[document_page.document_version.mimetype]:
            try:
                candidate().process_document_page(document_page)
            except ParserError:
                # This parser failed; move on to the next one in the list.
                continue

            # A parser succeeded, so the remaining ones for this MIME
            # type do not need to be tried.
            return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')
    except KeyError:
        # NOTE: this also converts a KeyError raised inside a parser.
        raise NoMIMETypeMatch
|
||||
|
||||
def process_document_version(self, document_version):
|
||||
logger.info('Starting parsing for document version: %s', document_version)
|
||||
logger.debug('document version: %d', document_version.pk)
|
||||
@@ -139,10 +157,14 @@ class PopplerParser(Parser):
|
||||
raise ParserError
|
||||
|
||||
output = proc.stdout.read()
|
||||
|
||||
if output == b'\x0c':
|
||||
logger.debug('Parser didn\'t return any output')
|
||||
return ''
|
||||
|
||||
if output[-3:] == b'\x0a\x0a\x0c':
|
||||
return output[:-3]
|
||||
|
||||
return output
|
||||
|
||||
|
||||
|
||||
@@ -1,5 +0,0 @@
|
||||
from django.utils.module_loading import import_string
|
||||
|
||||
from .settings import setting_ocr_backend
|
||||
|
||||
ocr_backend_class = import_string(setting_ocr_backend.value)
|
||||
@@ -11,7 +11,7 @@ from documents.models import DocumentVersion
|
||||
from lock_manager import Lock, LockError
|
||||
from mayan.celery import app
|
||||
|
||||
from .runtime import ocr_backend_class
|
||||
from .classes import TextExtractor
|
||||
from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
|
||||
from .models import DocumentVersionOCRError
|
||||
from .signals import post_document_version_ocr
|
||||
@@ -35,8 +35,7 @@ def task_do_ocr(self, document_version_pk):
|
||||
'Starting document OCR for document version: %s',
|
||||
document_version
|
||||
)
|
||||
backend = ocr_backend_class()
|
||||
backend.process_document_version(document_version)
|
||||
TextExtractor.process_document_version(document_version)
|
||||
except OperationalError as exception:
|
||||
logger.warning(
|
||||
'OCR error for document version: %s; %s. Retrying.',
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
@@ -19,7 +20,7 @@ class DocumentOCRTestCase(TestCase):
|
||||
|
||||
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=File(file_object), label='small document'
|
||||
file_object=File(file_object),
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
@@ -27,8 +28,10 @@ class DocumentOCRTestCase(TestCase):
|
||||
self.document_type.delete()
|
||||
|
||||
def test_ocr_language_backends_end(self):
|
||||
content = self.document.pages.first().ocr_content.content
|
||||
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
||||
'Mayan EDMS Documentation' in content
|
||||
)
|
||||
|
||||
|
||||
@@ -56,9 +59,11 @@ class GermanOCRSupportTestCase(TestCase):
|
||||
self.document_type.delete()
|
||||
|
||||
def test_ocr_language_backends_end(self):
|
||||
content = self.document.pages.first().ocr_content.content
|
||||
|
||||
self.assertTrue(
|
||||
'Repository für elektronische Dokumente.' in self.document.pages.first().ocr_content.content
|
||||
'Repository für elektronische Dokumente.' in content
|
||||
)
|
||||
self.assertTrue(
|
||||
'Es bietet einen elektronischen Tresor oder' in self.document.pages.first().ocr_content.content
|
||||
'Es bietet einen elektronischen Tresor oder' in content
|
||||
)
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
from django.test import TestCase
|
||||
|
||||
from documents.models import DocumentType
|
||||
from documents.test_models import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE
|
||||
from documents.test_models import (
|
||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE, TEST_HYBRID_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
from .classes import TextExtractor
|
||||
from .parsers import PDFMinerParser, PopplerParser
|
||||
|
||||
|
||||
@@ -46,3 +48,38 @@ class ParserTestCase(TestCase):
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
||||
)
|
||||
|
||||
|
||||
class TextExtractorTestCase(TestCase):
    """
    Run TextExtractor over the hybrid sample PDF and check the extracted
    content of its first and last pages.
    """

    def setUp(self):
        """
        Create a document type with automatic OCR turned off and upload
        the hybrid sample document to it.
        """
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE
        )

        # Presumably disabled so that text extraction only happens when the
        # test calls TextExtractor explicitly.
        ocr_settings = self.document_type.ocr_settings
        ocr_settings.auto_ocr = False
        ocr_settings.save()

        # A PDF is binary data: open it in binary mode so its bytes are not
        # subject to newline translation or text decoding.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        """
        Remove the document and the document type created by ``setUp``.
        """
        self.document.delete()
        self.document_type.delete()

    def test_text_extractor(self):
        """
        Extract the latest version's text and compare the content stored
        for the first and last pages with the expected strings.
        """
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )
|
||||
|
||||
Reference in New Issue
Block a user