From 32cf0a0595cf4422d020bccba8696667dcc05070 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sat, 27 Apr 2019 15:44:09 -0400 Subject: [PATCH] Add new default Tesseract OCR backend This new backend uses a command call to avoid Tesseract bug 1670 (https://github.com/tesseract-ocr/tesseract/issues/1670). Signed-off-by: Roberto Rosario --- HISTORY.rst | 3 + docs/releases/3.2.rst | 3 + mayan/apps/ocr/backends/literals.py | 4 + mayan/apps/ocr/backends/tesseract.py | 119 +++++++++++++++++++++++++++ mayan/apps/ocr/settings.py | 3 +- 5 files changed, 131 insertions(+), 1 deletion(-) create mode 100644 mayan/apps/ocr/backends/literals.py create mode 100644 mayan/apps/ocr/backends/tesseract.py diff --git a/HISTORY.rst b/HISTORY.rst index a74634addf..89e53aee26 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -176,6 +176,9 @@ * Remove app top level star imports. * Monkeypatch group and user models to make their fields translatable. +* Add new and default Tesseract OCR backend to avoid + Tesseract bug 1670 + (https://github.com/tesseract-ocr/tesseract/issues/1670) 3.1.11 (2019-04-XX) =================== diff --git a/docs/releases/3.2.rst b/docs/releases/3.2.rst index 3e72b1550e..7bccfb6467 100644 --- a/docs/releases/3.2.rst +++ b/docs/releases/3.2.rst @@ -208,6 +208,9 @@ Other changes * Remove app top level star imports. * Monkeypatch group and user models to make their fields translatable. 
+* Add new and default Tesseract OCR backend to avoid
+  Tesseract bug 1670
+  (https://github.com/tesseract-ocr/tesseract/issues/1670)
 
 Removals
 --------
diff --git a/mayan/apps/ocr/backends/literals.py b/mayan/apps/ocr/backends/literals.py
new file mode 100644
index 0000000000..7958188987
--- /dev/null
+++ b/mayan/apps/ocr/backends/literals.py
@@ -0,0 +1,4 @@
+from __future__ import absolute_import, unicode_literals
+
+DEFAULT_TESSERACT_BINARY_PATH = '/usr/bin/tesseract'
+DEFAULT_TESSERACT_TIMEOUT = 600  # 600 seconds, 10 minutes
diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py
new file mode 100644
index 0000000000..3fb2be5411
--- /dev/null
+++ b/mayan/apps/ocr/backends/tesseract.py
@@ -0,0 +1,119 @@
+from __future__ import absolute_import, unicode_literals
+
+import logging
+import shutil
+
+import sh
+import yaml
+try:
+    from yaml import CSafeLoader as SafeLoader
+except ImportError:
+    from yaml import SafeLoader
+
+from django.utils.encoding import force_text
+from django.utils.translation import ugettext_lazy as _
+
+from mayan.apps.storage.utils import TemporaryFile
+
+from ..classes import OCRBackendBase
+from ..exceptions import OCRError
+from ..settings import setting_ocr_backend_arguments
+
+from .literals import DEFAULT_TESSERACT_BINARY_PATH, DEFAULT_TESSERACT_TIMEOUT
+
+logger = logging.getLogger(__name__)
+
+
+class Tesseract(OCRBackendBase):
+    """OCR backend that calls the Tesseract command line binary."""
+    def __init__(self, *args, **kwargs):
+        """
+        Load the backend arguments, locate the Tesseract binary and cache
+        its available languages. Raises OCRError if the binary is missing.
+        """
+        super(Tesseract, self).__init__(*args, **kwargs)
+        self.languages = ()
+
+        backend_arguments = yaml.load(
+            Loader=SafeLoader,
+            stream=setting_ocr_backend_arguments.value or '{}',
+        )
+
+        tesseract_binary_path = backend_arguments.get(
+            'tesseract_path', DEFAULT_TESSERACT_BINARY_PATH
+        )
+        self.command_timeout = backend_arguments.get(
+            'timeout', DEFAULT_TESSERACT_TIMEOUT
+        )
+
+        try:
+            self.command_tesseract = sh.Command(path=tesseract_binary_path)
+        except sh.CommandNotFound:
+            self.command_tesseract = None
+            raise OCRError(_('Tesseract not found.'))
+        else:
+            # Get version
+            result = self.command_tesseract(v=True)
+            logger.debug('Tesseract version: %s', result.stdout)
+
+            # Get languages
+            result = self.command_tesseract(list_langs=True)
+            # Sample output format
+            # List of available languages (3):
+            # deu
+            # eng
+            # osd
+            # <- empty line
+
+            # Extraction: strip last line, split by newline, discard the first
+            # line
+            self.languages = force_text(result.stdout).strip().split('\n')[1:]
+
+            logger.debug('Available languages: %s', ', '.join(self.languages))
+
+    def execute(self, *args, **kwargs):
+        """
+        Execute the command line binary of tesseract on the page image and
+        return the recognized text. Raises OCRError if the call fails.
+        """
+        super(Tesseract, self).execute(*args, **kwargs)
+
+        if self.command_tesseract:
+            image = self.converter.get_page()
+
+            # Created outside the try so finally never sees an unbound name.
+            temporary_image_file = TemporaryFile()
+            try:
+                shutil.copyfileobj(image, temporary_image_file)
+                temporary_image_file.seek(0)
+
+                arguments = ['-', '-']
+
+                keyword_arguments = {
+                    '_in': temporary_image_file,
+                    '_timeout': self.command_timeout
+                }
+
+                if self.language:
+                    keyword_arguments['l'] = self.language
+
+                try:
+                    result = self.command_tesseract(
+                        *arguments, **keyword_arguments
+                    )
+                    return force_text(result.stdout)
+                except Exception as exception:
+                    error_message = (
+                        'Exception calling Tesseract with language option: {}; {}'
+                    ).format(self.language, exception)
+
+                    if self.language and self.language not in self.languages:
+                        error_message = (
+                            '{}\nThe requested OCR language "{}" is not '
+                            'available and needs to be installed.\n'
+                        ).format(error_message, self.language)
+
+                    logger.error(error_message)
+                    raise OCRError(error_message)
+            finally:
+                temporary_image_file.close()
diff --git a/mayan/apps/ocr/settings.py b/mayan/apps/ocr/settings.py
index b13ad70111..f2aa1052a3 100644
--- a/mayan/apps/ocr/settings.py
+++ b/mayan/apps/ocr/settings.py
@@ -7,7 +7,8 @@ from mayan.apps.smart_settings import Namespace
 namespace = Namespace(label=_('OCR'), name='ocr')
 
 setting_ocr_backend = namespace.add_setting(
-    global_name='OCR_BACKEND', default='mayan.apps.ocr.backends.pyocr.PyOCR',
+    global_name='OCR_BACKEND',
+    default='mayan.apps.ocr.backends.tesseract.Tesseract',
     help_text=_('Full path to the backend to be used to do OCR.')
 )
 setting_ocr_backend_arguments = namespace.add_setting(