diff --git a/mayan/apps/ocr/backends/__init__.py b/mayan/apps/ocr/backends/__init__.py index 6558a75c85..8b13789179 100644 --- a/mayan/apps/ocr/backends/__init__.py +++ b/mayan/apps/ocr/backends/__init__.py @@ -1,3 +1 @@ -class BackendBase(object): - def execute(self, input_filename, language=None): - raise NotImplementedError + diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py index e36b4c043a..ba70de6a91 100644 --- a/mayan/apps/ocr/backends/tesseract.py +++ b/mayan/apps/ocr/backends/tesseract.py @@ -1,55 +1,41 @@ from __future__ import unicode_literals +try: + from cStringIO import StringIO +except ImportError: + from StringIO import StringIO + import codecs import errno +import logging import os -import subprocess import tempfile +from PIL import Image, ImageFilter +import pytesseract + from common.utils import fs_cleanup -from . import BackendBase +from ..classes import OCRBackendBase from ..exceptions import OCRError from ..settings import TESSERACT_PATH +logger = logging.getLogger(__name__) -class Tesseract(BackendBase): - def execute(self, input_filename, language=None): + +class Tesseract(OCRBackendBase): + def execute(self, *args, **kwargs): """ Execute the command line binary of tesseract """ - fd, filepath = tempfile.mkstemp() - os.close(fd) - ocr_output = os.extsep.join([filepath, 'txt']) - command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)] - - if language is not None: - command.extend(['-l', language]) + super(Tesseract, self).execute(*args, **kwargs) + image = Image.open(self.converter.get_page()) try: - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - except OSError as exception: - if exception.errno == errno.ENOENT: - raise OCRError('Tesseract not found at %s' % TESSERACT_PATH) - else: - raise - else: - return_code = proc.wait() - if return_code != 0: - error_text = proc.stderr.read() - fs_cleanup(filepath) - fs_cleanup(ocr_output) - if language: - # If tesseract gives an error with a language parameter - # re-run it with no parameter again - return self.execute(input_filename, language=None) - else: - raise OCRError(error_text) + result = pytesseract.image_to_string(image=image, lang=self.language) + # If tesseract gives an error with a language parameter + # re-run it with no language parameter + except: + result = pytesseract.image_to_string(image=image) - fd = codecs.open(ocr_output, 'r', 'utf-8') - text = fd.read().strip() - fd.close() - - os.unlink(filepath) - - return text + return result diff --git a/mayan/apps/ocr/classes.py b/mayan/apps/ocr/classes.py new file mode 100644 index 0000000000..34abad8561 --- /dev/null +++ b/mayan/apps/ocr/classes.py @@ -0,0 +1,50 @@ +from __future__ import unicode_literals + +import logging +import os +import tempfile + +import sh + +from django.utils.module_loading import import_string +from django.utils.translation import ugettext_lazy as _ + +from common.settings import TEMPORARY_DIRECTORY +from common.utils import fs_cleanup +from converter import converter_class +from documents.models import DocumentPage + +from .exceptions import UnpaperError +from .literals import ( + DEFAULT_OCR_FILE_EXTENSION, DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT +) +from .parsers import parse_document_page +from .parsers.exceptions import ParserError, ParserUnknownFile +from .settings import UNPAPER_PATH + +logger = logging.getLogger(__name__) + + +class OCRBackendBase(object): + def process_document_version(self, document_version): + logger.info('Starting OCR for document version: %s', document_version) + logger.debug('document version: %d', document_version.pk) + + language = document_version.document.language + + for page in document_version.pages.all(): + image = page.get_image() + logger.info('Processing page: %d', page.page_number) + page.content = self.execute(file_object=image, language=language) + page.save() + image.close() + logger.info('Finished processing page: %d', page.page_number) + + def execute(self, file_object, language=None, transformations=None): + if not transformations: + transformations = [] + + self.converter = converter_class(file_object=file_object) + + for transformation in transformations: + self.converter.transform(transformation=transformation) diff --git a/mayan/apps/ocr/runtime.py b/mayan/apps/ocr/runtime.py index b7b8f23d64..c8a0ac40ec 100644 --- a/mayan/apps/ocr/runtime.py +++ b/mayan/apps/ocr/runtime.py @@ -2,4 +2,4 @@ from django.utils.module_loading import import_string from .settings import BACKEND -ocr_backend = import_string(BACKEND)() +ocr_backend_class = import_string(BACKEND) diff --git a/mayan/apps/ocr/tasks.py b/mayan/apps/ocr/tasks.py index 59b877c94d..ef1791a7e2 100644 --- a/mayan/apps/ocr/tasks.py +++ b/mayan/apps/ocr/tasks.py @@ -10,7 +10,7 @@ from documents.models import DocumentVersion from lock_manager import Lock, LockError from mayan.celery import app -from .api import do_document_ocr +from .runtime import ocr_backend_class from .literals import LOCK_EXPIRE from .models import DocumentVersionOCRError from .signals import post_document_version_ocr @@ -29,11 +29,12 @@ def task_do_ocr(self, document_version_pk): logger.debug('acquired lock: %s', lock_id) document_version = None try: - logger.info('Starting document OCR for document version: %d', document_version_pk) document_version = DocumentVersion.objects.get(pk=document_version_pk) - do_document_ocr(document_version) + logger.info('Starting document OCR for document version: %s', document_version) + backend = ocr_backend_class() + backend.process_document_version(document_version) except Exception as exception: - logger.error('OCR error for document version: %d; %s', document_version_pk, exception) + logger.error('OCR error for document version: %s; %s', document_version, exception) if document_version: entry, created = DocumentVersionOCRError.objects.get_or_create(document_version=document_version) @@ -48,7 +49,7 @@ def task_do_ocr(self, document_version_pk): entry.save() else: - logger.info('OCR for document: %d ended', document_version_pk) + logger.info('OCR complete for document version: %s', document_version) try: entry = DocumentVersionOCRError.objects.get(document_version=document_version) except DocumentVersionOCRError.DoesNotExist: diff --git a/requirements/common.txt b/requirements/common.txt index 4bd1b3376c..c47564e403 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -21,6 +21,7 @@ djangorestframework==2.4.4 pdfminer==20110227 pycountry==1.10 +pytesseract==0.1.6 python-dateutil==2.4.2 python-gnupg==0.3.7 python-magic==0.4.6