Refactor the OCR backend class to be file-object based and to use images from the document page rather than the actual file. Use pytesseract instead of calling the CLI directly.

This commit is contained in:
Roberto Rosario
2015-06-09 03:27:02 -04:00
parent 931bdfd113
commit 5275061f9f
6 changed files with 81 additions and 45 deletions

View File

@@ -1,3 +1 @@
class BackendBase(object):
    """
    Common interface for OCR backends.

    Concrete backends subclass this and override ``execute``.
    """

    def execute(self, input_filename, language=None):
        """
        Run the backend on ``input_filename``.

        ``language`` is optional and defaults to ``None``. The base
        implementation does no work: it always raises
        ``NotImplementedError`` so that a subclass which forgets to
        override this method fails loudly.
        """
        raise NotImplementedError

View File

@@ -1,55 +1,41 @@
from __future__ import unicode_literals
try:
from cStringIO import StringIO
except ImportError:
from StringIO import StringIO
import codecs
import errno
import logging
import os
import subprocess
import tempfile
from PIL import Image, ImageFilter
import pytesseract
from common.utils import fs_cleanup
from . import BackendBase
from ..classes import OCRBackendBase
from ..exceptions import OCRError
from ..settings import TESSERACT_PATH
logger = logging.getLogger(__name__)
class Tesseract(OCRBackendBase):
    """
    OCR backend that recognizes text with Tesseract through pytesseract.
    """

    def execute(self, *args, **kwargs):
        """
        Run Tesseract on the page image exposed by ``self.converter``.

        All arguments are forwarded unchanged to the parent ``execute``,
        which is expected to set ``self.converter``. When a language is
        available it is passed to Tesseract; if that attempt fails,
        recognition is retried once without a language.

        Returns the value produced by ``pytesseract.image_to_string``.
        Errors raised by the language-less attempt propagate to the caller.
        """
        super(Tesseract, self).execute(*args, **kwargs)

        image = Image.open(self.converter.get_page())

        # Prefer the language given in this call; fall back to an instance
        # attribute if one exists, and to no language otherwise.
        language = kwargs.get('language', getattr(self, 'language', None))

        if language:
            try:
                return pytesseract.image_to_string(image=image, lang=language)
            except Exception as exception:
                # If tesseract gives an error with a language parameter
                # re-run it with no language parameter
                logger.warning(
                    'Tesseract failed with language "%s": %s; retrying '
                    'without a language', language, exception
                )

        return pytesseract.image_to_string(image=image)

50
mayan/apps/ocr/classes.py Normal file
View File

@@ -0,0 +1,50 @@
from __future__ import unicode_literals
import logging
import os
import tempfile
import sh
from django.utils.module_loading import import_string
from django.utils.translation import ugettext_lazy as _
from common.settings import TEMPORARY_DIRECTORY
from common.utils import fs_cleanup
from converter import converter_class
from documents.models import DocumentPage
from .exceptions import UnpaperError
from .literals import (
DEFAULT_OCR_FILE_EXTENSION, DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT
)
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile
from .settings import UNPAPER_PATH
logger = logging.getLogger(__name__)
class OCRBackendBase(object):
    """
    Base class for OCR backends that work on document page images.

    ``process_document_version`` drives the per-page loop and stores the
    value returned by ``execute`` in each page's ``content``. The
    ``execute`` defined here only prepares ``self.converter`` and returns
    nothing; subclasses extend it to do the recognition and return the
    result.
    """

    def process_document_version(self, document_version):
        """
        Run OCR on every page of ``document_version`` and save the result.

        The language is read from ``document_version.document.language``
        and passed to ``execute`` for each page.
        """
        logger.info('Starting OCR for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        language = document_version.document.language

        for page in document_version.pages.all():
            image = page.get_image()
            try:
                logger.info('Processing page: %d', page.page_number)
                page.content = self.execute(
                    file_object=image, language=language
                )
                page.save()
            finally:
                # Release the page image even when OCR or saving fails
                image.close()

            logger.info('Finished processing page: %d', page.page_number)

    def execute(self, file_object, language=None, transformations=None):
        """
        Prepare a converter for ``file_object`` and apply transformations.

        ``language`` is stored on the instance as ``self.language`` so
        subclasses can read it. ``transformations`` is an optional iterable
        whose items are passed one by one to ``self.converter.transform``.
        """
        self.language = language

        if not transformations:
            transformations = []

        self.converter = converter_class(file_object=file_object)

        for transformation in transformations:
            self.converter.transform(transformation=transformation)

View File

@@ -2,4 +2,4 @@ from django.utils.module_loading import import_string
from .settings import BACKEND
# Instance of the backend: BACKEND is resolved with import_string and the
# result is called immediately with no arguments.
ocr_backend = import_string(BACKEND)()
# The object resolved from BACKEND itself, left uncalled so that users of
# this name can create their own instance.
ocr_backend_class = import_string(BACKEND)

View File

@@ -10,7 +10,7 @@ from documents.models import DocumentVersion
from lock_manager import Lock, LockError
from mayan.celery import app
from .api import do_document_ocr
from .runtime import ocr_backend_class
from .literals import LOCK_EXPIRE
from .models import DocumentVersionOCRError
from .signals import post_document_version_ocr
@@ -29,11 +29,12 @@ def task_do_ocr(self, document_version_pk):
logger.debug('acquired lock: %s', lock_id)
document_version = None
try:
logger.info('Starting document OCR for document version: %d', document_version_pk)
document_version = DocumentVersion.objects.get(pk=document_version_pk)
do_document_ocr(document_version)
logger.info('Starting document OCR for document version: %s', document_version)
backend = ocr_backend_class()
backend.process_document_version(document_version)
except Exception as exception:
logger.error('OCR error for document version: %d; %s', document_version_pk, exception)
logger.error('OCR error for document version: %s; %s', document_version, exception)
if document_version:
entry, created = DocumentVersionOCRError.objects.get_or_create(document_version=document_version)
@@ -48,7 +49,7 @@ def task_do_ocr(self, document_version_pk):
entry.save()
else:
logger.info('OCR for document: %d ended', document_version_pk)
logger.info('OCR complete for document version: %s', document_version)
try:
entry = DocumentVersionOCRError.objects.get(document_version=document_version)
except DocumentVersionOCRError.DoesNotExist:

View File

@@ -21,6 +21,7 @@ djangorestframework==2.4.4
pdfminer==20110227
pycountry==1.10
pytesseract==0.1.6
python-dateutil==2.4.2
python-gnupg==0.3.7
python-magic==0.4.6