diff --git a/HISTORY.rst b/HISTORY.rst index 5d7b8a9f52..e259526c82 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -1,7 +1,7 @@ 2.1.1 (2016-05-17) ================== - Fix navigation issue that made it impossible to add new sources. GitLab issue #288. - +- The Tesseract OCR backend now reports if the requested language file is missing. GitLab issue #289. 2.1 (2016-05-14) ================ diff --git a/docs/releases/2.1.1.rst b/docs/releases/2.1.1.rst index 1a13ef1b42..e5268e6550 100644 --- a/docs/releases/2.1.1.rst +++ b/docs/releases/2.1.1.rst @@ -16,7 +16,10 @@ queryset generated using the .defer() or .only() Django filter optimization features to resolve to their parent class transparently. This optimization caused problems with the sources app which uses a - +Missing Tesseract language files +-------------------------------- +The Tesseract OCR backend now reports if the Tesseract language file is missing +for the requested document's language. Removals -------- @@ -72,6 +75,7 @@ Bugs fixed or issues closed =========================== * `GitLab issue #288 <https://gitlab.com/mayan-edms/mayan-edms/issues/288>`_ Can't add sources in mayan-edms 2.1. +* `GitLab issue #289 <https://gitlab.com/mayan-edms/mayan-edms/issues/289>`_ OCR fails with Exception. .. 
_PyPI: https://pypi.python.org/pypi/mayan-edms/ diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py index 45fd9d152c..b157787928 100644 --- a/mayan/apps/ocr/backends/tesseract.py +++ b/mayan/apps/ocr/backends/tesseract.py @@ -2,6 +2,8 @@ from __future__ import unicode_literals import logging +import sh + from PIL import Image import pytesseract @@ -13,6 +15,23 @@ logger = logging.getLogger(__name__) class Tesseract(OCRBackendBase): + def __init__(self, *args, **kwargs): + super(Tesseract, self).__init__(*args, **kwargs) + try: + self.binary = sh.Command(setting_tesseract_path.value) + except sh.CommandNotFound: + self.binary = None + + def get_languages(self): + if self.binary: + result = self.binary(list_langs=True) + + return [ + language for language in result.stderr.split('\n') if language + ] + else: + return () + def execute(self, *args, **kwargs): """ Execute the command line binary of tesseract @@ -29,7 +48,16 @@ class Tesseract(OCRBackendBase): # re-run it with no language parameter except Exception as exception: error_message = 'Exception calling pytesseract with language option: {}; {}'.format(self.language, exception) + + if self.binary: + if self.language not in self.get_languages(): + error_message = '{}\nThe requested Tesseract language file for "{}" is not available and needs to be installed.\nIf using Debian or Ubuntu run: apt-get install tesseract-ocr-{}'.format(error_message, self.language, self.language) + logger.error(error_message) raise OCRError(error_message) return result + + + +