Add new OCR backend using PyOCR. Remove current direct call Tesseract backend.

2016-12-30 00:36:45 -04:00
parent 5b94b419e9
commit 6bfdb053e3
6 changed files with 72 additions and 66 deletions
--- a/docs/releases/2.2.rst
+++ b/docs/releases/2.2.rst
@@ -35,10 +35,15 @@ on production install to debug errors live.
 - Refactor the remove document from folder view to allow removing documents from multiple folders at the same time.
 - Refactor the document mailing views and add support for sending multiple documents via email at the same time.
 - Refactor the document metadata views and add support for adding multiple metadata types to a document at the same time.
+- Addition of a new OCR backend using PyOCR. This backend first tries to do OCR
+  using libtesseract. If libtesseract is not available, the backend falls back
+  to calling the Tesseract executable.

 Removals
 --------
-* None
+- Removal of the OCR_TESSERACT_PATH configuration setting.
+- Removal of the Tesseract OCR backend. Replaced with a PyOCR backend.
+- Remove usage of pytesseract Python library.

 Upgrading from a previous version
 ---------------------------------
--- a/mayan/apps/ocr/backends/pyocr.py
+++ b/mayan/apps/ocr/backends/pyocr.py
@@ -0,0 +1,62 @@
+from __future__ import absolute_import, unicode_literals
+
+import logging
+
+from PIL import Image
+import pyocr
+import pyocr.builders
+
+from ..classes import OCRBackendBase
+from ..exceptions import OCRError
+
+logger = logging.getLogger(__name__)
+
+
+class PyOCR(OCRBackendBase):
+    """OCR backend that runs recognition through a PyOCR tool."""
+
+    def __init__(self, *args, **kwargs):
+        super(PyOCR, self).__init__(*args, **kwargs)
+
+        tools = pyocr.get_available_tools()
+        if not tools:
+            raise OCRError('No OCR tool found')
+
+        # Prefer libtesseract; otherwise use the first tool, as PyOCR returns
+        # the tools in the recommended order of usage.
+        preferred = [t for t in tools if t.__name__ == 'pyocr.libtesseract']
+        self.tool = (preferred or tools)[0]
+        logger.debug('Will use tool \'%s\'', self.tool.get_name())
+
+        self.languages = self.tool.get_available_languages()
+        logger.debug('Available languages: %s', ', '.join(self.languages))
+
+    def execute(self, *args, **kwargs):
+        """
+        Run the PyOCR tool on the converter's page image and return the text.
+        Raise OCRError, naming the language and cause, if the tool fails.
+        """
+        super(PyOCR, self).execute(*args, **kwargs)
+
+        image = Image.open(self.converter.get_page())
+        try:
+            result = self.tool.image_to_string(
+                image,
+                lang=self.language,
+                builder=pyocr.builders.TextBuilder()
+            )
+        except Exception as exception:
+            error_message = (
+                'Exception calling pyocr with language option: {}; {}'
+            ).format(self.language, exception)
+
+            if self.language not in self.languages:
+                error_message = (
+                    '{}\nThe requested OCR language "{}" is not '
+                    'available and needs to be installed.\n'
+                ).format(error_message, self.language)
+
+            logger.error(error_message)
+            raise OCRError(error_message)
+        else:
+            return result
--- a/mayan/apps/ocr/backends/tesseract.py
+++ b/mayan/apps/ocr/backends/tesseract.py
@@ -1,59 +0,0 @@
-from __future__ import unicode_literals
-
-import logging
-
-import sh
-
-from PIL import Image
-import pytesseract
-
-from ..classes import OCRBackendBase
-from ..exceptions import OCRError
-from ..settings import setting_tesseract_path
-
-logger = logging.getLogger(__name__)
-
-
-class Tesseract(OCRBackendBase):
-    def __init__(self, *args, **kwargs):
-        super(Tesseract, self).__init__(*args, **kwargs)
-        try:
-            self.binary = sh.Command(setting_tesseract_path.value)
-        except sh.CommandNotFound:
-            self.binary = None
-
-    def get_languages(self):
-        if self.binary:
-            result = self.binary(list_langs=True)
-
-            return [
-                language for language in result.stderr.split('\n') if language
-            ]
-        else:
-            return ()
-
-    def execute(self, *args, **kwargs):
-        """
-        Execute the command line binary of tesseract
-        """
-        super(Tesseract, self).execute(*args, **kwargs)
-
-        # TODO: pass tesseract binary path to the pytesseract
-        image = Image.open(self.converter.get_page())
-        try:
-            result = pytesseract.image_to_string(
-                image=image, lang=self.language
-            )
-            # If tesseract gives an error with a language parameter
-            # re-run it with no language parameter
-        except Exception as exception:
-            error_message = 'Exception calling pytesseract with language option: {}; {}'.format(self.language, exception)
-
-            if self.binary:
-                if self.language not in self.get_languages():
-                    error_message = '{}\nThe requested Tesseract language file for "{}" is not available and needs to be installed.\nIf using Debian or Ubuntu run: apt-get install tesseract-ocr-{}'.format(error_message, self.language, self.language)
-
-            logger.error(error_message)
-            raise OCRError(error_message)
-
-        return result
--- a/mayan/apps/ocr/settings.py
+++ b/mayan/apps/ocr/settings.py
@@ -5,10 +5,7 @@ from django.utils.translation import ugettext_lazy as _
 from smart_settings import Namespace

 namespace = Namespace(name='ocr', label=_('OCR'))
-setting_tesseract_path = namespace.add_setting(
-    global_name='OCR_TESSERACT_PATH', default='/usr/bin/tesseract',
-    help_text=_('File path to tesseract program.'), is_path=True
-)
+
 setting_pdftotext_path = namespace.add_setting(
    global_name='OCR_PDFTOTEXT_PATH', default='/usr/bin/pdftotext',
    help_text=_(
@@ -18,7 +15,7 @@ setting_pdftotext_path = namespace.add_setting(
    is_path=True
 )
 setting_ocr_backend = namespace.add_setting(
-    global_name='OCR_BACKEND', default='ocr.backends.tesseract.Tesseract',
+    global_name='OCR_BACKEND', default='ocr.backends.pyocr.PyOCR',
    help_text=_('Full path to the backend to be used to do OCR.')
 )
 setting_auto_ocr = namespace.add_setting(
--- a/removals.txt
+++ b/removals.txt
@@ -1,2 +1,3 @@
 # Packages to be remove during upgrades
 django-filetransfers
+pytesseract
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -27,7 +27,7 @@ fusepy==2.0.4

 pdfminer==20140328
 pycountry==1.20
-pytesseract==0.1.6
+pyocr==0.4.4
 python-dateutil==2.5.3
 python-gnupg==0.3.9
 python-magic==0.4.12