Text parsers and OCR backends are now used in tandem for each document.

This commit is contained in:
Roberto Rosario
2015-08-08 04:49:08 -04:00
parent cf00ba2c40
commit bec85f38f4
9 changed files with 115 additions and 18 deletions

Binary file not shown.

View File

@@ -77,6 +77,7 @@ What's new in Mayan EDMS v2.0
* Support for sharing an index as a FUSE filesystem.
* Preview images' titles are clickable.
* Improved API
* Text parsers and OCR backends are now used in tandem for each document.
Upgrading from a previous version
=================================

View File

@@ -48,6 +48,10 @@ TEST_COMPRESSED_DOCUMENT_PATH = os.path.join(
)
# Fixture values for test documents: a description text and the label given
# to the document type created by the tests.
TEST_DOCUMENT_DESCRIPTION = 'test description'
TEST_DOCUMENT_TYPE = 'test_document_type'
# File name of the sample PDF; the name indicates it combines text and image
# content.
TEST_HYBRID_DOCUMENT = 'hybrid_text_and_image.pdf'
# Relative path to the hybrid sample inside the sample documents directory.
TEST_HYBRID_DOCUMENT_PATH = os.path.join(
'contrib', 'sample_documents', TEST_HYBRID_DOCUMENT
)
class DocumentTestCase(TestCase):

View File

@@ -2,13 +2,47 @@ from __future__ import unicode_literals
import logging
from django.utils.module_loading import import_string
from converter import converter_class
from .exceptions import NoMIMETypeMatch, ParserError
from .models import DocumentPageContent
from .parsers import Parser
from .settings import setting_ocr_backend
logger = logging.getLogger(__name__)
class TextExtractor(object):
    """
    Coordinates text extraction for document pages by combining the
    registered text parsers with the configured OCR backend.
    """

    @classmethod
    def perform_ocr(cls, document_page):
        """
        Instantiate the OCR backend named by ``setting_ocr_backend`` and run
        it against ``document_page``.
        """
        backend_class = import_string(setting_ocr_backend.value)
        backend_class().process_document_page(document_page)

    @classmethod
    def process_document_page(cls, document_page):
        """
        Extract the text of a single page of a document version.

        The page is handed to the text parsers first. OCR is performed when
        no parser is available for the MIME type, when parsing fails, or
        when parsing leaves the page without content.
        """
        try:
            Parser.parse_document_page(document_page=document_page)
        except (NoMIMETypeMatch, ParserError):
            # No usable parser for this page: fall back to OCR.
            cls.perform_ocr(document_page=document_page)
            return

        # Parsing finished without error; only OCR if it produced no text.
        if document_page.ocr_content.content:
            return

        cls.perform_ocr(document_page=document_page)

    @classmethod
    def process_document_version(cls, document_version):
        """
        Run ``process_document_page`` on every page of ``document_version``.
        """
        for page in document_version.pages.all():
            cls.process_document_page(document_page=page)
class OCRBackendBase(object):
def process_document_version(self, document_version):
logger.info('Starting OCR for document version: %s', document_version)
@@ -17,9 +51,9 @@ class OCRBackendBase(object):
language = document_version.document.language
for document_page in document_version.pages.all():
self.process_document_page(document_page=document_page, language=language)
self.process_document_page(document_page=document_page)
def process_document_page(self, document_page, language=None):
def process_document_page(self, document_page):
logger.info(
'Processing page: %d of document version: %s',
document_page.page_number, document_page.document_version
@@ -32,7 +66,7 @@ class OCRBackendBase(object):
document_page=document_page
)
document_page_content.content = self.execute(
file_object=image, language=language
file_object=image, language=document_page.document.language
)
document_page_content.save()
finally:

View File

@@ -38,7 +38,7 @@ class Parser(object):
).append(parser_class)
@classmethod
def process_document_version(cls, document_version):
def parse_document_version(cls, document_version):
try:
for parser_class in cls._registry[document_version.mimetype]:
try:
@@ -56,6 +56,24 @@ class Parser(object):
except KeyError:
raise NoMIMETypeMatch
@classmethod
def parse_document_page(cls, document_page):
    """
    Parse a single page with the first parser that accepts it.

    The parsers registered for the MIME type of the page's document version
    are tried in order until one completes without raising ``ParserError``.

    Raises ``NoMIMETypeMatch`` when every registered parser fails, or when a
    ``KeyError`` is raised while looking up or running the parsers (an
    unregistered MIME type being the lookup case).
    """
    try:
        mimetype = document_page.document_version.mimetype
        for candidate in cls._registry[mimetype]:
            try:
                candidate().process_document_page(document_page)
            except ParserError:
                # This parser could not handle the page; move on to the
                # next one registered for the MIME type.
                continue

            # The first successful parser wins, skip the remaining ones.
            return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')
    except KeyError:
        raise NoMIMETypeMatch
def process_document_version(self, document_version):
logger.info('Starting parsing for document version: %s', document_version)
logger.debug('document version: %d', document_version.pk)
@@ -139,10 +157,14 @@ class PopplerParser(Parser):
raise ParserError
output = proc.stdout.read()
if output == b'\x0c':
logger.debug('Parser didn\'t return any output')
return ''
if output[-3:] == b'\x0a\x0a\x0c':
return output[:-3]
return output

View File

@@ -1,5 +0,0 @@
from django.utils.module_loading import import_string
from .settings import setting_ocr_backend
# Resolve the OCR backend class named by the setting once, at import time.
ocr_backend_class = import_string(setting_ocr_backend.value)

View File

@@ -11,7 +11,7 @@ from documents.models import DocumentVersion
from lock_manager import Lock, LockError
from mayan.celery import app
from .runtime import ocr_backend_class
from .classes import TextExtractor
from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
from .models import DocumentVersionOCRError
from .signals import post_document_version_ocr
@@ -35,8 +35,7 @@ def task_do_ocr(self, document_version_pk):
'Starting document OCR for document version: %s',
document_version
)
backend = ocr_backend_class()
backend.process_document_version(document_version)
TextExtractor.process_document_version(document_version)
except OperationalError as exception:
logger.warning(
'OCR error for document version: %s; %s. Retrying.',

View File

@@ -1,4 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.core.files.base import File
@@ -19,7 +20,7 @@ class DocumentOCRTestCase(TestCase):
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=File(file_object), label='small document'
file_object=File(file_object),
)
def tearDown(self):
@@ -27,8 +28,10 @@ class DocumentOCRTestCase(TestCase):
self.document_type.delete()
def test_ocr_language_backends_end(self):
content = self.document.pages.first().ocr_content.content
self.assertTrue(
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
'Mayan EDMS Documentation' in content
)
@@ -56,9 +59,11 @@ class GermanOCRSupportTestCase(TestCase):
self.document_type.delete()
def test_ocr_language_backends_end(self):
content = self.document.pages.first().ocr_content.content
self.assertTrue(
'Repository für elektronische Dokumente.' in self.document.pages.first().ocr_content.content
'Repository für elektronische Dokumente.' in content
)
self.assertTrue(
'Es bietet einen elektronischen Tresor oder' in self.document.pages.first().ocr_content.content
'Es bietet einen elektronischen Tresor oder' in content
)

View File

@@ -1,12 +1,14 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.core.files.base import File
from django.test import TestCase
from documents.models import DocumentType
from documents.test_models import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE
from documents.test_models import (
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE, TEST_HYBRID_DOCUMENT_PATH
)
from .classes import TextExtractor
from .parsers import PDFMinerParser, PopplerParser
@@ -46,3 +48,38 @@ class ParserTestCase(TestCase):
self.assertTrue(
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
)
class TextExtractorTestCase(TestCase):
    """
    Run TextExtractor over the hybrid sample document and check the content
    stored for its first and last pages.
    """

    def setUp(self):
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE
        )

        # Turn off automatic OCR for this document type; the test invokes
        # the extractor explicitly.
        ocr_settings = self.document_type.ocr_settings
        ocr_settings.auto_ocr = False
        ocr_settings.save()

        # The sample is a PDF, so read it as bytes instead of relying on
        # the platform's text mode.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )
        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )