Refactor the OCR backend class to be file-object based and to use images from the document page rather than the actual file. Use pytesseract instead of calling the Tesseract CLI directly.

2015-06-09 03:27:02 -04:00
parent 931bdfd113
commit 5275061f9f
6 changed files with 81 additions and 45 deletions
--- a/mayan/apps/ocr/backends/tesseract.py
+++ b/mayan/apps/ocr/backends/tesseract.py
@@ -1,55 +1,41 @@
 from __future__ import unicode_literals

+try:
+    from cStringIO import StringIO
+except ImportError:
+    from StringIO import StringIO
+
 import codecs
 import errno
+import logging
 import os
-import subprocess
 import tempfile

+from PIL import Image, ImageFilter
+import pytesseract
+
 from common.utils import fs_cleanup

-from . import BackendBase
+from ..classes import OCRBackendBase
 from ..exceptions import OCRError
 from ..settings import TESSERACT_PATH

+logger = logging.getLogger(__name__)

-class Tesseract(BackendBase):
-    def execute(self, input_filename, language=None):
+
+class Tesseract(OCRBackendBase):
+    def execute(self, *args, **kwargs):
        """
        Execute the command line binary of tesseract
        """
-        fd, filepath = tempfile.mkstemp()
-        os.close(fd)
-        ocr_output = os.extsep.join([filepath, 'txt'])
-        command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
-
-        if language is not None:
-            command.extend(['-l', language])
+        super(Tesseract, self).execute(*args, **kwargs)

+        image = Image.open(self.converter.get_page())
        try:
-            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
-        except OSError as exception:
-            if exception.errno == errno.ENOENT:
-                raise OCRError('Tesseract not found at %s' % TESSERACT_PATH)
-            else:
-                raise
-        else:
-            return_code = proc.wait()
-            if return_code != 0:
-                error_text = proc.stderr.read()
-                fs_cleanup(filepath)
-                fs_cleanup(ocr_output)
-                if language:
-                    # If tesseract gives an error with a language parameter
-                    # re-run it with no parameter again
-                    return self.execute(input_filename, language=None)
-                else:
-                    raise OCRError(error_text)
+            result = pytesseract.image_to_string(image=image, lang=self.language)
+            # If tesseract gives an error with a language parameter
+            # re-run it with no language parameter
+        except:
+            result = pytesseract.image_to_string(image=image)

-            fd = codecs.open(ocr_output, 'r', 'utf-8')
-            text = fd.read().strip()
-            fd.close()
-
-            os.unlink(filepath)
-
-        return text
+        return result