Text parsers and OCR backends are now used in tandem for each document.
This commit is contained in:
@@ -2,13 +2,47 @@ from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.utils.module_loading import import_string
|
||||
|
||||
from converter import converter_class
|
||||
|
||||
from .exceptions import NoMIMETypeMatch, ParserError
|
||||
from .models import DocumentPageContent
|
||||
from .parsers import Parser
|
||||
from .settings import setting_ocr_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextExtractor(object):
    """
    Coordinate text extraction for document pages: the text parsers are
    tried first and the configured OCR backend is used as the fallback.
    """

    @classmethod
    def perform_ocr(cls, document_page):
        """
        Instantiate the OCR backend class named by the `setting_ocr_backend`
        setting and have it process the given document page.
        """
        # The backend class is resolved from its dotted path on every call.
        backend_instance = import_string(setting_ocr_backend.value)()
        backend_instance.process_document_page(document_page)

    @classmethod
    def process_document_page(cls, document_page):
        """
        Extract text for a document version's page. Try parsing the page
        first; if there is no parser for the MIME type, the parser raises
        an error, or the parser returns nothing, fall back to doing an OCR
        of the page.
        """
        try:
            Parser.parse_document_page(document_page=document_page)
        except (NoMIMETypeMatch, ParserError):
            # Parsing was not possible for this page: use OCR instead.
            cls.perform_ocr(document_page=document_page)
            return

        # Parsing finished without error; keep its result if it is not empty.
        if document_page.ocr_content.content:
            return

        cls.perform_ocr(document_page=document_page)

    @classmethod
    def process_document_version(cls, document_version):
        """
        Run the text extraction on every page of the given document version.
        """
        for page in document_version.pages.all():
            cls.process_document_page(document_page=page)
|
||||
|
||||
|
||||
class OCRBackendBase(object):
|
||||
def process_document_version(self, document_version):
|
||||
logger.info('Starting OCR for document version: %s', document_version)
|
||||
@@ -17,9 +51,9 @@ class OCRBackendBase(object):
|
||||
language = document_version.document.language
|
||||
|
||||
for document_page in document_version.pages.all():
|
||||
self.process_document_page(document_page=document_page, language=language)
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page, language=None):
|
||||
def process_document_page(self, document_page):
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
@@ -32,7 +66,7 @@ class OCRBackendBase(object):
|
||||
document_page=document_page
|
||||
)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=image, language=language
|
||||
file_object=image, language=document_page.document.language
|
||||
)
|
||||
document_page_content.save()
|
||||
finally:
|
||||
|
||||
Reference in New Issue
Block a user