Re-enabled Tesseract language-specific OCR processing and added a one-time language-neutral retry for failed language-specific OCR

This commit is contained in:
Roberto Rosario
2011-11-22 17:46:18 -04:00
parent 667af2a442
commit deb09d3d83

View File

@@ -57,9 +57,8 @@ def run_tesseract(input_filename, lang=None):
ocr_output = os.extsep.join([filepath, u'txt'])
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
# TODO: Tesseract 3.0 segfaults
#if lang is not None:
# command.extend([u'-l', lang])
if lang is not None:
command.extend([u'-l', lang])
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
@@ -67,6 +66,11 @@ def run_tesseract(input_filename, lang=None):
error_text = proc.stderr.read()
cleanup(filepath)
cleanup(ocr_output)
if lang:
# If tesseract fails when given a language parameter,
# retry once without the language parameter
return run_tesseract(input_filename, lang=None)
else:
raise TesseractError(error_text)
fd = codecs.open(ocr_output, 'r', 'utf-8')