From 1cbd54dad799ee3c95d413826d314a80f2371f8d Mon Sep 17 00:00:00 2001
From: Mathias Behrle <mathiasb@m9s.biz>
Date: Thu, 25 Sep 2014 19:31:46 +0200
Subject: [PATCH 1/4] Do not fail when the OCR backend is missing (#50).

- Since tesseract (i.e. an OCR backend) is an optional requirement of mayan, we
  shouldn't fail if it is not found.
- See also https://github.com/mayan-edms/mayan-edms/issues/50
---
 mayan/apps/ocr/api.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py
index 195e65994e..a97e806e8a 100644
--- a/mayan/apps/ocr/api.py
+++ b/mayan/apps/ocr/api.py
@@ -73,6 +73,9 @@ def do_document_ocr(queue_document):
                 document_page.content = ocr_cleanup(ocr_text)
                 document_page.page_label = _(u'Text from OCR')
                 document_page.save()
+            except Exception as e:
+                logger.debug('missing ocr backend: %s' % ocr_backend)
+                logger.debug('I/O error({0}): {1}'.format(e.errno, e.strerror))
             finally:
                 fs_cleanup(pre_ocr_filepath_w_ext)
                 fs_cleanup(unpaper_input)

From 3effc58ab083e9ee91d44fc153eb0d4672906441 Mon Sep 17 00:00:00 2001
From: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Date: Tue, 30 Sep 2014 15:21:21 -0400
Subject: [PATCH 2/4] Fix the way we check for missing binary dependencies in
 the installation app (issue #50). Thanks to Ford Guo (@fordguo) for the report.

---
 mayan/apps/installation/models.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/mayan/apps/installation/models.py b/mayan/apps/installation/models.py
index b04ca35d24..9b8fa6f7ad 100644
--- a/mayan/apps/installation/models.py
+++ b/mayan/apps/installation/models.py
@@ -74,29 +74,32 @@ class Installation(SingletonModel):
     def binary_dependencies(self):
         namespace = PropertyNamespace('bins', _(u'Binary dependencies'))
 
-        tesseract = sh.Command(TESSERACT_PATH)
         try:
-            namespace.add_property('tesseract', _(u'tesseract version'), tesseract('-v').stderr, report=True)
+            tesseract = sh.Command(TESSERACT_PATH)
         except sh.CommandNotFound:
             namespace.add_property('tesseract', _(u'tesseract version'), _(u'not found'), report=True)
         except Exception:
             namespace.add_property('tesseract', _(u'tesseract version'), _(u'error getting version'), report=True)
+        else:
+            namespace.add_property('tesseract', _(u'tesseract version'), tesseract('-v').stderr, report=True)
 
-        unpaper = sh.Command(UNPAPER_PATH)
         try:
-            namespace.add_property('unpaper', _(u'unpaper version'), unpaper('-V').stdout, report=True)
+            unpaper = sh.Command(UNPAPER_PATH)
         except sh.CommandNotFound:
             namespace.add_property('unpaper', _(u'unpaper version'), _(u'not found'), report=True)
         except Exception:
             namespace.add_property('unpaper', _(u'unpaper version'), _(u'error getting version'), report=True)
+        else:
+            namespace.add_property('unpaper', _(u'unpaper version'), unpaper('-V').stdout, report=True)
 
-        pdftotext = sh.Command(PDFTOTEXT_PATH)
         try:
-            namespace.add_property('pdftotext', _(u'pdftotext version'), pdftotext('-v').stderr, report=True)
+            pdftotext = sh.Command(PDFTOTEXT_PATH)
         except sh.CommandNotFound:
             namespace.add_property('pdftotext', _(u'pdftotext version'), _(u'not found'), report=True)
         except Exception:
             namespace.add_property('pdftotext', _(u'pdftotext version'), _(u'error getting version'), report=True)
+        else:
+            namespace.add_property('pdftotext', _(u'pdftotext version'), pdftotext('-v').stderr, report=True)
 
     def mayan_properties(self):
         namespace = PropertyNamespace('mayan', _(u'Mayan EDMS'))

From f434720617a7e8acb12b8f4a0a4427c2941a4224 Mon Sep 17 00:00:00 2001
From: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Date: Tue, 30 Sep 2014 15:39:36 -0400
Subject: [PATCH 3/4] Don't silence OCR errors at the API high level, at this
 layer we don't know what really happened in the backend. Move exception
 handling to the backend.

---
 mayan/apps/ocr/api.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py
index a97e806e8a..195e65994e 100644
--- a/mayan/apps/ocr/api.py
+++ b/mayan/apps/ocr/api.py
@@ -73,9 +73,6 @@ def do_document_ocr(queue_document):
                 document_page.content = ocr_cleanup(ocr_text)
                 document_page.page_label = _(u'Text from OCR')
                 document_page.save()
-            except Exception as e:
-                logger.debug('missing ocr backend: %s' % ocr_backend)
-                logger.debug('I/O error({0}): {1}'.format(e.errno, e.strerror))
             finally:
                 fs_cleanup(pre_ocr_filepath_w_ext)
                 fs_cleanup(unpaper_input)

From 4968051b6dab4b9b2347633cb1b02cc64bd5f8c0 Mon Sep 17 00:00:00 2001
From: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Date: Tue, 30 Sep 2014 15:41:00 -0400
Subject: [PATCH 4/4] Don't silence OCR errors even if Tesseract is optional,
 otherwise the user won't know what happened. Catch the generic OSError
 exception and raise a friendlier "Tesseract not found" exception

---
 mayan/apps/ocr/backends/tesseract.py | 38 +++++++++++++++++-----------
 1 file changed, 23 insertions(+), 15 deletions(-)

diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py
index ba679c9b02..51cef200fb 100644
--- a/mayan/apps/ocr/backends/tesseract.py
+++ b/mayan/apps/ocr/backends/tesseract.py
@@ -1,6 +1,7 @@
 from __future__ import absolute_import
 
 import codecs
+import errno
 import os
 import subprocess
 import tempfile
@@ -25,23 +26,30 @@ class Tesseract(BackendBase):
         if language is not None:
             command.extend([u'-l', language])
 
-        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-        return_code = proc.wait()
-        if return_code != 0:
-            error_text = proc.stderr.read()
-            fs_cleanup(filepath)
-            fs_cleanup(ocr_output)
-            if language:
-                # If tesseract gives an error with a language parameter
-                # re-run it with no parameter again
-                return self.execute(input_filename, language=None)
+        try:
+            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+        except OSError as exception:
+            if exception.errno == errno.ENOENT:
+                raise OCRError('Tesseract not found at %s' % TESSERACT_PATH)
             else:
-                raise OCRError(error_text)
+                raise
+        else:
+            return_code = proc.wait()
+            if return_code != 0:
+                error_text = proc.stderr.read()
+                fs_cleanup(filepath)
+                fs_cleanup(ocr_output)
+                if language:
+                    # If tesseract gives an error with a language parameter
+                    # re-run it with no parameter again
+                    return self.execute(input_filename, language=None)
+                else:
+                    raise OCRError(error_text)
 
-        fd = codecs.open(ocr_output, 'r', 'utf-8')
-        text = fd.read().strip()
-        fd.close()
+            fd = codecs.open(ocr_output, 'r', 'utf-8')
+            text = fd.read().strip()
+            fd.close()
 
-        os.unlink(filepath)
+            os.unlink(filepath)
 
         return text