Refactor OCR app. Removes document parsing. Moves OCR processing to
model manager. Add submit and finish events. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -1,80 +1,9 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.utils.module_loading import import_string
|
||||
|
||||
from converter import converter_class
|
||||
from documents.runtime import cache_storage_backend
|
||||
|
||||
from .exceptions import NoMIMETypeMatch, ParserError
|
||||
from .models import DocumentPageContent
|
||||
from .parsers import Parser
|
||||
from .settings import setting_ocr_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextExtractor(object):
|
||||
@classmethod
|
||||
def perform_ocr(cls, document_page):
|
||||
ocr_backend_class = import_string(setting_ocr_backend.value)
|
||||
backend = ocr_backend_class()
|
||||
backend.process_document_page(document_page)
|
||||
|
||||
@classmethod
|
||||
def process_document_page(cls, document_page):
|
||||
"""
|
||||
Extract text for a document version's page. Try parsing the page and if
|
||||
no there are not parsers for the MIME type or the parser return nothing
|
||||
fallback to doing and OCR of the page.
|
||||
"""
|
||||
|
||||
try:
|
||||
Parser.parse_document_page(document_page=document_page)
|
||||
except (NoMIMETypeMatch, ParserError):
|
||||
cls.perform_ocr(document_page=document_page)
|
||||
else:
|
||||
if not document_page.ocr_content.content:
|
||||
cls.perform_ocr(document_page=document_page)
|
||||
|
||||
@classmethod
|
||||
def process_document_version(cls, document_version):
|
||||
for document_page in document_version.pages.all():
|
||||
cls.process_document_page(document_page=document_page)
|
||||
|
||||
|
||||
class OCRBackendBase(object):
|
||||
def process_document_version(self, document_version):
|
||||
logger.info('Starting OCR for document version: %s', document_version)
|
||||
logger.debug('document version: %d', document_version.pk)
|
||||
|
||||
for document_page in document_version.pages.all():
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page):
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
cache_filename = document_page.generate_image()
|
||||
|
||||
with cache_storage_backend.open(cache_filename) as file_object:
|
||||
document_page_content, created = DocumentPageContent.objects.get_or_create(
|
||||
document_page=document_page
|
||||
)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=file_object,
|
||||
language=document_page.document.language
|
||||
)
|
||||
document_page_content.save()
|
||||
|
||||
logger.info(
|
||||
'Finished processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
def execute(self, file_object, language=None, transformations=None):
|
||||
self.language = language
|
||||
|
||||
|
||||
Reference in New Issue
Block a user