Text parsers and OCR backends are now used in tandem for each document.

This commit is contained in:
Roberto Rosario
2015-08-08 04:49:08 -04:00
parent cf00ba2c40
commit bec85f38f4
9 changed files with 115 additions and 18 deletions

View File

@@ -2,13 +2,47 @@ from __future__ import unicode_literals
import logging
from django.utils.module_loading import import_string
from converter import converter_class
from .exceptions import NoMIMETypeMatch, ParserError
from .models import DocumentPageContent
from .parsers import Parser
from .settings import setting_ocr_backend
logger = logging.getLogger(__name__)
class TextExtractor(object):
    """
    Coordinates text extraction for document pages by combining the text
    parsers with the OCR backend selected in the settings.
    """

    @classmethod
    def perform_ocr(cls, document_page):
        """
        Instantiate the OCR backend class named by the `setting_ocr_backend`
        setting and let it process the given page.
        """
        backend_class = import_string(setting_ocr_backend.value)
        backend_class().process_document_page(document_page)

    @classmethod
    def process_document_page(cls, document_page):
        """
        Extract the text of a single document version page.

        The page is parsed first. OCR is used as a fallback when there is no
        parser for the MIME type, when the parser raises an error, or when
        parsing leaves the page content empty.
        """
        try:
            Parser.parse_document_page(document_page=document_page)
        except (NoMIMETypeMatch, ParserError):
            # No suitable parser, or the parser failed: OCR is required.
            needs_ocr = True
        else:
            # Parsing ran without error; OCR only if it produced no content.
            needs_ocr = not document_page.ocr_content.content

        if needs_ocr:
            cls.perform_ocr(document_page=document_page)

    @classmethod
    def process_document_version(cls, document_version):
        """
        Run `process_document_page` on every page of the document version.
        """
        for page in document_version.pages.all():
            cls.process_document_page(document_page=page)
class OCRBackendBase(object):
def process_document_version(self, document_version):
logger.info('Starting OCR for document version: %s', document_version)
@@ -17,9 +51,9 @@ class OCRBackendBase(object):
language = document_version.document.language
for document_page in document_version.pages.all():
self.process_document_page(document_page=document_page, language=language)
self.process_document_page(document_page=document_page)
def process_document_page(self, document_page, language=None):
def process_document_page(self, document_page):
logger.info(
'Processing page: %d of document version: %s',
document_page.page_number, document_page.document_version
@@ -32,7 +66,7 @@ class OCRBackendBase(object):
document_page=document_page
)
document_page_content.content = self.execute(
file_object=image, language=language
file_object=image, language=document_page.document.language
)
document_page_content.save()
finally: