Refactor the OCR backend class to be file-object based and to use images from the document page rather than the actual file. Use pytesseract instead of calling the CLI directly.
This commit is contained in:
@@ -1,3 +1 @@
|
||||
class BackendBase(object):
    """
    Interface for OCR backends.

    Concrete backends subclass this and provide their own execute().
    """

    def execute(self, input_filename, language=None):
        """
        Placeholder that subclasses override.

        Takes the input file name and an optional language; this base
        version always raises NotImplementedError.
        """
        raise NotImplementedError()
|
||||
|
||||
|
||||
@@ -1,55 +1,41 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
try:
|
||||
from cStringIO import StringIO
|
||||
except ImportError:
|
||||
from StringIO import StringIO
|
||||
|
||||
import codecs
|
||||
import errno
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
import tempfile
|
||||
|
||||
from PIL import Image, ImageFilter
|
||||
import pytesseract
|
||||
|
||||
from common.utils import fs_cleanup
|
||||
|
||||
from . import BackendBase
|
||||
from ..classes import OCRBackendBase
|
||||
from ..exceptions import OCRError
|
||||
from ..settings import TESSERACT_PATH
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Tesseract OCR backend, shown here as a rendered diff hunk: lines of the
# removed command-line implementation and of the added pytesseract
# implementation are interleaved without +/- markers, so this span is not
# runnable as-is.
# NOTE(review): the "removed"/"added" attribution in the notes below is
# inferred from the commit message and the names each line uses -- confirm
# against the repository before relying on it.
#
# Presumably removed: class header and file-name based signature.
class Tesseract(BackendBase):
|
||||
def execute(self, input_filename, language=None):
|
||||
|
||||
# Presumably added: class header on the new base class; all arguments are
# forwarded to the base-class execute() further down.
class Tesseract(OCRBackendBase):
|
||||
def execute(self, *args, **kwargs):
|
||||
# NOTE(review): this docstring still describes running the command line
# binary, while the pytesseract lines below call
# pytesseract.image_to_string() -- consider updating it.
"""
|
||||
Execute the command line binary of tesseract
|
||||
"""
|
||||
# Presumably removed: reserves a temporary path and builds the tesseract
# command (binary, input file, output base name, optional "-l <language>").
fd, filepath = tempfile.mkstemp()
|
||||
os.close(fd)
|
||||
ocr_output = os.extsep.join([filepath, 'txt'])
|
||||
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
|
||||
|
||||
if language is not None:
|
||||
command.extend(['-l', language])
|
||||
# Presumably added: run the base-class execute() first, then open the page
# obtained from self.converter as a PIL image.
super(Tesseract, self).execute(*args, **kwargs)
|
||||
|
||||
image = Image.open(self.converter.get_page())
|
||||
try:
|
||||
# Presumably removed: launches the command; a missing binary (ENOENT) is
# reported as OCRError, any other OSError is re-raised. On a non-zero exit
# the temporary files are cleaned up and the call is retried once without a
# language, or OCRError is raised with the captured stderr.
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
except OSError as exception:
|
||||
if exception.errno == errno.ENOENT:
|
||||
raise OCRError('Tesseract not found at %s' % TESSERACT_PATH)
|
||||
else:
|
||||
raise
|
||||
else:
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
error_text = proc.stderr.read()
|
||||
fs_cleanup(filepath)
|
||||
fs_cleanup(ocr_output)
|
||||
if language:
|
||||
# If tesseract gives an error with a language parameter
|
||||
# re-run it with no parameter again
|
||||
return self.execute(input_filename, language=None)
|
||||
else:
|
||||
raise OCRError(error_text)
|
||||
# Presumably added: OCR the image with the requested language.
# NOTE(review): this reads self.language, which must already have been set
# (e.g. by the base-class execute()) -- verify, because a missing attribute
# would be hidden by the except clause below.
result = pytesseract.image_to_string(image=image, lang=self.language)
|
||||
# If tesseract gives an error with a language parameter
|
||||
# re-run it with no language parameter
|
||||
# NOTE(review): the bare except catches every exception, not only tesseract
# language errors, and silently retries without a language.
|
||||
except:
|
||||
result = pytesseract.image_to_string(image=image)
|
||||
|
||||
# Presumably removed: reads the generated text file as UTF-8, strips it,
# deletes the temporary file and returns the text.
fd = codecs.open(ocr_output, 'r', 'utf-8')
|
||||
text = fd.read().strip()
|
||||
|
||||
fd.close()
|
||||
|
||||
os.unlink(filepath)
|
||||
|
||||
return text
|
||||
# Presumably added: returns the text produced by pytesseract.
return result
|
||||
|
||||
50
mayan/apps/ocr/classes.py
Normal file
50
mayan/apps/ocr/classes.py
Normal file
@@ -0,0 +1,50 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
|
||||
import sh
|
||||
|
||||
from django.utils.module_loading import import_string
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.settings import TEMPORARY_DIRECTORY
|
||||
from common.utils import fs_cleanup
|
||||
from converter import converter_class
|
||||
from documents.models import DocumentPage
|
||||
|
||||
from .exceptions import UnpaperError
|
||||
from .literals import (
|
||||
DEFAULT_OCR_FILE_EXTENSION, DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT
|
||||
)
|
||||
from .parsers import parse_document_page
|
||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||
from .settings import UNPAPER_PATH
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class OCRBackendBase(object):
    """
    Base class for OCR backends.

    process_document_version() drives the per-page loop and stores whatever
    execute() returns as the page content. execute() here only prepares
    ``self.language`` and ``self.converter``; it returns None, so
    subclasses are expected to extend it and return the recognized text.
    """

    def process_document_version(self, document_version):
        """
        Run OCR on every page of ``document_version``.

        For each page the image from ``page.get_image()`` is passed to
        execute() together with the document language, the result is
        assigned to ``page.content`` and the page is saved.

        Exceptions raised by execute() or ``page.save()`` propagate to the
        caller; the page image is closed first.
        """
        logger.info('Starting OCR for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        language = document_version.document.language

        for page in document_version.pages.all():
            image = page.get_image()
            try:
                logger.info('Processing page: %d', page.page_number)
                page.content = self.execute(
                    file_object=image, language=language
                )
                page.save()
            finally:
                # Release the page image even when OCR or saving fails, so
                # a failing page does not leak an open file object.
                image.close()

            logger.info('Finished processing page: %d', page.page_number)

    def execute(self, file_object, language=None, transformations=None):
        """
        Prepare the backend state for one OCR run.

        file_object -- image file object handed to ``converter_class``.
        language -- optional language; stored as ``self.language``.
        transformations -- optional iterable; each item is applied through
            ``self.converter.transform()``.

        Returns None.
        """
        # Keep the language on the instance: subclasses read self.language
        # after calling this method, and without this assignment the
        # requested language would never reach them.
        self.language = language

        if not transformations:
            transformations = []

        self.converter = converter_class(file_object=file_object)

        for transformation in transformations:
            self.converter.transform(transformation=transformation)
|
||||
@@ -2,4 +2,4 @@ from django.utils.module_loading import import_string
|
||||
|
||||
from .settings import BACKEND
|
||||
|
||||
ocr_backend = import_string(BACKEND)()
|
||||
ocr_backend_class = import_string(BACKEND)
|
||||
|
||||
@@ -10,7 +10,7 @@ from documents.models import DocumentVersion
|
||||
from lock_manager import Lock, LockError
|
||||
from mayan.celery import app
|
||||
|
||||
from .api import do_document_ocr
|
||||
from .runtime import ocr_backend_class
|
||||
from .literals import LOCK_EXPIRE
|
||||
from .models import DocumentVersionOCRError
|
||||
from .signals import post_document_version_ocr
|
||||
@@ -29,11 +29,12 @@ def task_do_ocr(self, document_version_pk):
|
||||
logger.debug('acquired lock: %s', lock_id)
|
||||
document_version = None
|
||||
try:
|
||||
logger.info('Starting document OCR for document version: %d', document_version_pk)
|
||||
document_version = DocumentVersion.objects.get(pk=document_version_pk)
|
||||
do_document_ocr(document_version)
|
||||
logger.info('Starting document OCR for document version: %s', document_version)
|
||||
backend = ocr_backend_class()
|
||||
backend.process_document_version(document_version)
|
||||
except Exception as exception:
|
||||
logger.error('OCR error for document version: %d; %s', document_version_pk, exception)
|
||||
logger.error('OCR error for document version: %s; %s', document_version, exception)
|
||||
if document_version:
|
||||
entry, created = DocumentVersionOCRError.objects.get_or_create(document_version=document_version)
|
||||
|
||||
@@ -48,7 +49,7 @@ def task_do_ocr(self, document_version_pk):
|
||||
|
||||
entry.save()
|
||||
else:
|
||||
logger.info('OCR for document: %d ended', document_version_pk)
|
||||
logger.info('OCR complete for document version: %s', document_version)
|
||||
try:
|
||||
entry = DocumentVersionOCRError.objects.get(document_version=document_version)
|
||||
except DocumentVersionOCRError.DoesNotExist:
|
||||
|
||||
@@ -21,6 +21,7 @@ djangorestframework==2.4.4
|
||||
|
||||
pdfminer==20110227
|
||||
pycountry==1.10
|
||||
pytesseract==0.1.6
|
||||
python-dateutil==2.4.2
|
||||
python-gnupg==0.3.7
|
||||
python-magic==0.4.6
|
||||
|
||||
Reference in New Issue
Block a user