Improved tesseract execution handling

2011-02-17 23:31:54 -04:00
parent 5e5fee7330
commit c18cb099c6
1 changed files with 21 additions and 29 deletions
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,5 +1,6 @@
 #Some code from http://wiki.github.com/hoffstaetter/python-tesseract

+import codecs
 import os
 import subprocess
 import tempfile
@@ -14,6 +15,11 @@ from documents.models import Document
 from converter.api import convert_document_for_ocr

 from ocr.conf.settings import TESSERACT_PATH
+from ocr.conf.settings import TESSERACT_LANGUAGE
+
+
+class TesseractError(Exception):
+    pass


 def cleanup(filename):
@@ -23,29 +29,17 @@ def cleanup(filename):
    except OSError:
        pass

-class TesseractError(Exception):
-    pass
-#    def __init__(self, status, message):
-#        self.status = status
-#        self.message = message
-
-def get_errors(error_string):
-    '''
-    returns all lines in the error_string that start with the string "error"
-
-    '''
-    lines = error_string.splitlines()
-    return lines[1]
-    #error_lines = (line for line in lines if line.find('error') >= 0)
-    #return '\n'.join(error_lines)

 def run_tesseract(input_filename, output_filename_base, lang=None):
    command = [TESSERACT_PATH, input_filename, output_filename_base]
    if lang is not None:
        command += ['-l', lang]
    
-    proc = subprocess.Popen(command, stderr=subprocess.PIPE)
-    return (proc.wait(), proc.stderr.read())
+    proc = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+    return_code = proc.wait()
+    if return_code != 0:
+        error_text = proc.stderr.read()
+        raise TesseractError(error_text)


 def do_document_ocr(document):
@@ -53,21 +47,19 @@ def do_document_ocr(document):
        imagefile = convert_document_for_ocr(document, page=page_index)
        desc, filepath = tempfile.mkstemp()
        try:
-            status, error_string = run_tesseract(imagefile, filepath)
-            if status:
-                errors = get_errors(error_string)
-                raise TesseractError(errors)
-        finally:
+            run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
            ocr_output = os.extsep.join([filepath, 'txt'])
-
-        f = file(ocr_output)
-        try:
+            f = codecs.open(ocr_output, 'r', 'utf-8')
            document_page = document.documentpage_set.get(page_number=page_index+1)
            document_page.content = f.read().strip()
            document_page.page_label = _(u'Text from OCR')
            document_page.save()
-        finally:
            f.close()
-            cleanup(filepath)
            cleanup(ocr_output)
+        except TesseractError, e:
+            cleanup(filepath)
+            cleanup(imagefile)
+            raise TesseractError(e)
+        finally:
+            cleanup(filepath)
            cleanup(imagefile)