Add new default Tesseract OCR backend
This new backend uses a command call to avoid Tesseract bug 1670 (https://github.com/tesseract-ocr/tesseract/issues/1670).

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -176,6 +176,9 @@
|
||||
* Remove app top level star imports.
|
||||
* Monkeypatch group and user models to make their fields
|
||||
translatable.
|
||||
* Add new and default Tesseract OCR backend to avoid
|
||||
Tesseract bug 1670
|
||||
(https://github.com/tesseract-ocr/tesseract/issues/1670)
|
||||
|
||||
3.1.11 (2019-04-XX)
|
||||
===================
|
||||
|
||||
@@ -208,6 +208,9 @@ Other changes
|
||||
* Remove app top level star imports.
|
||||
* Monkeypatch group and user models to make their fields
|
||||
translatable.
|
||||
* Add new and default Tesseract OCR backend to avoid
|
||||
Tesseract bug 1670
|
||||
(https://github.com/tesseract-ocr/tesseract/issues/1670)
|
||||
|
||||
Removals
|
||||
--------
|
||||
|
||||
4
mayan/apps/ocr/backends/literals.py
Normal file
4
mayan/apps/ocr/backends/literals.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
# Filesystem location of the Tesseract executable, used when the backend
# arguments do not supply a 'tesseract_path' entry.
DEFAULT_TESSERACT_BINARY_PATH = '/usr/bin/tesseract'
# Timeout for a single Tesseract invocation, in seconds (10 minutes), used
# when the backend arguments do not supply a 'timeout' entry.
DEFAULT_TESSERACT_TIMEOUT = 10 * 60
|
||||
119
mayan/apps/ocr/backends/tesseract.py
Normal file
119
mayan/apps/ocr/backends/tesseract.py
Normal file
@@ -0,0 +1,119 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
import logging
|
||||
import shutil
|
||||
|
||||
import sh
|
||||
import yaml
|
||||
try:
|
||||
from yaml import CSafeLoader as SafeLoader
|
||||
except ImportError:
|
||||
from yaml import SafeLoader
|
||||
|
||||
from django.utils.encoding import force_text
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from mayan.apps.storage.utils import TemporaryFile
|
||||
|
||||
from ..classes import OCRBackendBase
|
||||
from ..exceptions import OCRError
|
||||
from ..settings import setting_ocr_backend_arguments
|
||||
|
||||
from .literals import DEFAULT_TESSERACT_BINARY_PATH, DEFAULT_TESSERACT_TIMEOUT
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Tesseract(OCRBackendBase):
    """
    OCR backend that runs the Tesseract command line binary through ``sh``.

    Backend arguments are read as a YAML mapping from
    ``setting_ocr_backend_arguments``. Recognized keys:

    * ``tesseract_path``: path of the Tesseract binary. Defaults to
      ``DEFAULT_TESSERACT_BINARY_PATH``.
    * ``timeout``: value handed to ``sh`` as ``_timeout`` for every OCR
      call. Defaults to ``DEFAULT_TESSERACT_TIMEOUT``.
    """

    def __init__(self, *args, **kwargs):
        """
        Load the backend arguments, locate the Tesseract binary and cache
        the list of languages it reports.

        Raises ``OCRError`` when the binary cannot be found.
        """
        super(Tesseract, self).__init__(*args, **kwargs)
        self.languages = ()
        # Defined before the lookup so the attribute always exists, even if
        # the binary is missing and OCRError is raised below.
        self.command_tesseract = None

        # The trailing "or {}" covers YAML documents that load to None
        # (for example "null" or a comment-only value) so that .get() below
        # does not fail with AttributeError.
        backend_arguments = yaml.load(
            Loader=SafeLoader,
            stream=setting_ocr_backend_arguments.value or '{}',
        ) or {}

        tesseract_binary_path = backend_arguments.get(
            'tesseract_path', DEFAULT_TESSERACT_BINARY_PATH
        )
        self.command_timeout = backend_arguments.get(
            'timeout', DEFAULT_TESSERACT_TIMEOUT
        )

        try:
            self.command_tesseract = sh.Command(path=tesseract_binary_path)
        except sh.CommandNotFound:
            # sh raises CommandNotFound when the path does not resolve to
            # an executable; report it with the OCR app's own exception.
            raise OCRError(
                _('Tesseract not found.')
            )

        # Get version
        result = self.command_tesseract(v=True)
        logger.debug('Tesseract version: %s', result.stdout)

        # Get languages
        result = self.command_tesseract(list_langs=True)
        # Sample output format
        # List of available languages (3):
        # deu
        # eng
        # osd
        # <- empty line

        # Extraction: strip last line, split by newline, discard the first
        # line
        self.languages = force_text(result.stdout).strip().split('\n')[1:]

        logger.debug('Available languages: %s', ', '.join(self.languages))

    def execute(self, *args, **kwargs):
        """
        Execute the command line binary of tesseract

        The page image obtained from ``self.converter.get_page()`` is copied
        into a temporary file which is fed to Tesseract on standard input;
        the recognized text is read from standard output.

        Returns the text produced by Tesseract, or ``None`` when no
        Tesseract command is available.

        Raises ``OCRError`` when the Tesseract call, or decoding its output,
        fails.
        """
        super(Tesseract, self).execute(*args, **kwargs)

        if not self.command_tesseract:
            return None

        image = self.converter.get_page()

        # Created outside the try block so that the finally clause never
        # references an unbound name if the temporary file cannot be
        # created.
        temporary_image_file = TemporaryFile()
        try:
            shutil.copyfileobj(image, temporary_image_file)
            temporary_image_file.seek(0)

            # '-' '-' tells Tesseract to read the image from stdin and to
            # write the result to stdout.
            arguments = ['-', '-']

            keyword_arguments = {
                '_in': temporary_image_file,
                '_timeout': self.command_timeout
            }

            if self.language:
                keyword_arguments['l'] = self.language

            try:
                result = self.command_tesseract(
                    *arguments, **keyword_arguments
                )
                return force_text(result.stdout)
            except Exception as exception:
                error_message = (
                    'Exception calling Tesseract with language option: {}; {}'
                ).format(self.language, exception)

                # Only hint at a missing language pack when a language was
                # actually requested.
                if self.language and self.language not in self.languages:
                    error_message = (
                        '{}\nThe requested OCR language "{}" is not '
                        'available and needs to be installed.\n'
                    ).format(
                        error_message, self.language
                    )

                logger.error(error_message)
                raise OCRError(error_message)
        finally:
            temporary_image_file.close()
|
||||
@@ -7,7 +7,8 @@ from mayan.apps.smart_settings import Namespace
|
||||
namespace = Namespace(label=_('OCR'), name='ocr')
|
||||
|
||||
# Dotted path of the OCR backend class. The default is the command line
# based Tesseract backend; the previous default was
# 'mayan.apps.ocr.backends.pyocr.PyOCR'.
setting_ocr_backend = namespace.add_setting(
    global_name='OCR_BACKEND',
    default='mayan.apps.ocr.backends.tesseract.Tesseract',
    help_text=_('Full path to the backend to be used to do OCR.')
)
|
||||
setting_ocr_backend_arguments = namespace.add_setting(
|
||||
|
||||
Reference in New Issue
Block a user