From 1cbd54dad799ee3c95d413826d314a80f2371f8d Mon Sep 17 00:00:00 2001 From: Mathias Behrle Date: Thu, 25 Sep 2014 19:31:46 +0200 Subject: [PATCH 1/4] Do not fail, when ocr backend is missing (#50). - Since tesseract (i.e. an ocr backend) is an optional requirement of mayan, we shouldn't fail, if it is not found. - S.a https://github.com/mayan-edms/mayan-edms/issues/50 --- mayan/apps/ocr/api.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index 195e65994e..a97e806e8a 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -73,6 +73,9 @@ def do_document_ocr(queue_document): document_page.content = ocr_cleanup(ocr_text) document_page.page_label = _(u'Text from OCR') document_page.save() + except Exception as e: + logger.debug('missing ocr backend: %s' % ocr_backend) + logger.debug('I/O error({0}): {1}'.format(e.errno, e.strerror)) finally: fs_cleanup(pre_ocr_filepath_w_ext) fs_cleanup(unpaper_input) From 3effc58ab083e9ee91d44fc153eb0d4672906441 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 30 Sep 2014 15:21:21 -0400 Subject: [PATCH 2/4] Fix the way we try for missing binary dependencies in the installations app Issue #50. Thanks to Ford Guo (@fordguo) for the reporting. 
--- mayan/apps/installation/models.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/mayan/apps/installation/models.py b/mayan/apps/installation/models.py index b04ca35d24..9b8fa6f7ad 100644 --- a/mayan/apps/installation/models.py +++ b/mayan/apps/installation/models.py @@ -74,29 +74,32 @@ class Installation(SingletonModel): def binary_dependencies(self): namespace = PropertyNamespace('bins', _(u'Binary dependencies')) - tesseract = sh.Command(TESSERACT_PATH) try: - namespace.add_property('tesseract', _(u'tesseract version'), tesseract('-v').stderr, report=True) + tesseract = sh.Command(TESSERACT_PATH) except sh.CommandNotFound: namespace.add_property('tesseract', _(u'tesseract version'), _(u'not found'), report=True) except Exception: namespace.add_property('tesseract', _(u'tesseract version'), _(u'error getting version'), report=True) + else: + namespace.add_property('tesseract', _(u'tesseract version'), tesseract('-v').stderr, report=True) - unpaper = sh.Command(UNPAPER_PATH) try: - namespace.add_property('unpaper', _(u'unpaper version'), unpaper('-V').stdout, report=True) + unpaper = sh.Command(UNPAPER_PATH) except sh.CommandNotFound: namespace.add_property('unpaper', _(u'unpaper version'), _(u'not found'), report=True) except Exception: namespace.add_property('unpaper', _(u'unpaper version'), _(u'error getting version'), report=True) + else: + namespace.add_property('unpaper', _(u'unpaper version'), unpaper('-V').stdout, report=True) - pdftotext = sh.Command(PDFTOTEXT_PATH) try: - namespace.add_property('pdftotext', _(u'pdftotext version'), pdftotext('-v').stderr, report=True) + pdftotext = sh.Command(PDFTOTEXT_PATH) except sh.CommandNotFound: namespace.add_property('pdftotext', _(u'pdftotext version'), _(u'not found'), report=True) except Exception: namespace.add_property('pdftotext', _(u'pdftotext version'), _(u'error getting version'), report=True) + else: + namespace.add_property('pdftotext', _(u'pdftotext version'), 
pdftotext('-v').stderr, report=True) def mayan_properties(self): namespace = PropertyNamespace('mayan', _(u'Mayan EDMS')) From f434720617a7e8acb12b8f4a0a4427c2941a4224 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 30 Sep 2014 15:39:36 -0400 Subject: [PATCH 3/4] Don't silence OCR errors at the API high level, at this layer we don't know what really happened in the backend. Move exception handling to the backend. --- mayan/apps/ocr/api.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index a97e806e8a..195e65994e 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -73,9 +73,6 @@ def do_document_ocr(queue_document): document_page.content = ocr_cleanup(ocr_text) document_page.page_label = _(u'Text from OCR') document_page.save() - except Exception as e: - logger.debug('missing ocr backend: %s' % ocr_backend) - logger.debug('I/O error({0}): {1}'.format(e.errno, e.strerror)) finally: fs_cleanup(pre_ocr_filepath_w_ext) fs_cleanup(unpaper_input) From 4968051b6dab4b9b2347633cb1b02cc64bd5f8c0 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 30 Sep 2014 15:41:00 -0400 Subject: [PATCH 4/4] Don't silence OCR errors even if Tesseract is optional otherwise the user won't know what happened. 
Catch the OSError generic exception and return a friendlier "Tesseract not found" exception --- mayan/apps/ocr/backends/tesseract.py | 38 +++++++++++++++++----------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py index ba679c9b02..51cef200fb 100644 --- a/mayan/apps/ocr/backends/tesseract.py +++ b/mayan/apps/ocr/backends/tesseract.py @@ -1,6 +1,7 @@ from __future__ import absolute_import import codecs +import errno import os import subprocess import tempfile @@ -25,23 +26,30 @@ class Tesseract(BackendBase): if language is not None: command.extend([u'-l', language]) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - error_text = proc.stderr.read() - fs_cleanup(filepath) - fs_cleanup(ocr_output) - if language: - # If tesseract gives an error with a language parameter - # re-run it with no parameter again - return self.execute(input_filename, language=None) + try: + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) + except OSError as exception: + if exception.errno == errno.ENOENT: + raise OCRError('Tesseract not found at %s' % TESSERACT_PATH) else: - raise OCRError(error_text) + raise + else: + return_code = proc.wait() + if return_code != 0: + error_text = proc.stderr.read() + fs_cleanup(filepath) + fs_cleanup(ocr_output) + if language: + # If tesseract gives an error with a language parameter + # re-run it with no parameter again + return self.execute(input_filename, language=None) + else: + raise OCRError(error_text) - fd = codecs.open(ocr_output, 'r', 'utf-8') - text = fd.read().strip() - fd.close() + fd = codecs.open(ocr_output, 'r', 'utf-8') + text = fd.read().strip() + fd.close() - os.unlink(filepath) + os.unlink(filepath) return text