From deb09d3d8394299ec68ca22d1e8552942ea7d69d Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 22 Nov 2011 17:46:18 -0400 Subject: [PATCH] Re-enabled Tesseract language-specific OCR processing and added a one-time language-neutral retry for failed language-specific OCR --- apps/ocr/api.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index d9d7782b1d..4d70443f92 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -57,9 +57,8 @@ def run_tesseract(input_filename, lang=None): ocr_output = os.extsep.join([filepath, u'txt']) command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)] - # TODO: Tesseract 3.0 segfaults - #if lang is not None: - # command.extend([u'-l', lang]) + if lang is not None: + command.extend([u'-l', lang]) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() @@ -67,7 +66,12 @@ def run_tesseract(input_filename, lang=None): error_text = proc.stderr.read() cleanup(filepath) cleanup(ocr_output) - raise TesseractError(error_text) + if lang: + # If tesseract gives an error with a language parameter + # re-run it with no parameter again + return run_tesseract(input_filename, lang=None) + else: + raise TesseractError(error_text) fd = codecs.open(ocr_output, 'r', 'utf-8') text = fd.read().strip()