Refactor the OCR app. Remove document parsing. Move OCR processing to the
model manager. Add submit and finish events.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-23 02:04:57 -04:00
parent 2052caada4
commit 317d07a355
20 changed files with 309 additions and 497 deletions

View File

@@ -1,80 +1,9 @@
from __future__ import unicode_literals
import logging
from django.utils.module_loading import import_string
from converter import converter_class
from documents.runtime import cache_storage_backend
from .exceptions import NoMIMETypeMatch, ParserError
from .models import DocumentPageContent
from .parsers import Parser
from .settings import setting_ocr_backend
logger = logging.getLogger(__name__)
class TextExtractor(object):
    """
    Class-level helpers that extract the text of document pages, trying the
    parser first and falling back to the configured OCR backend.
    """

    @classmethod
    def perform_ocr(cls, document_page):
        """
        Import the backend class named by the OCR backend setting, create an
        instance of it and have it process the given document page.
        """
        backend_class = import_string(setting_ocr_backend.value)
        backend_class().process_document_page(document_page)

    @classmethod
    def process_document_page(cls, document_page):
        """
        Extract text for a document version's page. Try parsing the page
        first; if there is no parser for the MIME type, the parser fails, or
        the parser leaves the page content empty, fall back to doing an OCR
        of the page.
        """
        try:
            Parser.parse_document_page(document_page=document_page)
        except (NoMIMETypeMatch, ParserError):
            # No usable parser for this page: OCR is the only option left.
            cls.perform_ocr(document_page=document_page)
            return

        if document_page.ocr_content.content:
            # The parser produced text, nothing else to do.
            return

        # Parsing succeeded but yielded no content.
        cls.perform_ocr(document_page=document_page)

    @classmethod
    def process_document_version(cls, document_version):
        """
        Run the text extraction on every page of the document version.
        """
        for page in document_version.pages.all():
            cls.process_document_page(document_page=page)
class OCRBackendBase(object):
def process_document_version(self, document_version):
    """
    Log the start of the OCR run for the document version and then process
    each of its pages in turn.
    """
    logger.info('Starting OCR for document version: %s', document_version)
    logger.debug('document version: %d', document_version.pk)

    pages = document_version.pages.all()
    for page in pages:
        self.process_document_page(document_page=page)
def process_document_page(self, document_page):
    """
    Generate the page image, pass it to `execute` together with the
    document's language and save the returned text as the content of the
    page. The start and the end of the processing are logged.
    """
    logger.info(
        'Processing page: %d of document version: %s',
        document_page.page_number, document_page.document_version
    )

    image_name = document_page.generate_image()

    with cache_storage_backend.open(image_name) as image_file:
        # Fetch (or create) the content row only once the image is open,
        # then fill it with whatever text the backend returns.
        page_content, _created = DocumentPageContent.objects.get_or_create(
            document_page=document_page
        )
        ocr_text = self.execute(
            file_object=image_file,
            language=document_page.document.language
        )
        page_content.content = ocr_text
        page_content.save()

    logger.info(
        'Finished processing page: %d of document version: %s',
        document_page.page_number, document_page.document_version
    )
def execute(self, file_object, language=None, transformations=None):
self.language = language