Re-enabled Tesseract language-specific OCR processing and added a one-time language-neutral retry for when language-specific OCR fails
This commit is contained in:
@@ -57,9 +57,8 @@ def run_tesseract(input_filename, lang=None):
|
||||
ocr_output = os.extsep.join([filepath, u'txt'])
|
||||
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
|
||||
|
||||
# TODO: Tesseract 3.0 segfaults
|
||||
#if lang is not None:
|
||||
# command.extend([u'-l', lang])
|
||||
if lang is not None:
|
||||
command.extend([u'-l', lang])
|
||||
|
||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||
return_code = proc.wait()
|
||||
@@ -67,6 +66,11 @@ def run_tesseract(input_filename, lang=None):
|
||||
error_text = proc.stderr.read()
|
||||
cleanup(filepath)
|
||||
cleanup(ocr_output)
|
||||
if lang:
|
||||
# If tesseract gives an error with a language parameter
|
||||
# re-run it once without the language parameter
|
||||
return run_tesseract(input_filename, lang=None)
|
||||
else:
|
||||
raise TesseractError(error_text)
|
||||
|
||||
fd = codecs.open(ocr_output, 'r', 'utf-8')
|
||||
|
||||
Reference in New Issue
Block a user