Re-enabled Tesseract language-specific OCR processing and added a one-time language-neutral retry for failed language-specific OCR
This commit is contained in:
@@ -57,9 +57,8 @@ def run_tesseract(input_filename, lang=None):
|
|||||||
ocr_output = os.extsep.join([filepath, u'txt'])
|
ocr_output = os.extsep.join([filepath, u'txt'])
|
||||||
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
|
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
|
||||||
|
|
||||||
# TODO: Tesseract 3.0 segfaults
|
if lang is not None:
|
||||||
#if lang is not None:
|
command.extend([u'-l', lang])
|
||||||
# command.extend([u'-l', lang])
|
|
||||||
|
|
||||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||||
return_code = proc.wait()
|
return_code = proc.wait()
|
||||||
@@ -67,7 +66,12 @@ def run_tesseract(input_filename, lang=None):
|
|||||||
error_text = proc.stderr.read()
|
error_text = proc.stderr.read()
|
||||||
cleanup(filepath)
|
cleanup(filepath)
|
||||||
cleanup(ocr_output)
|
cleanup(ocr_output)
|
||||||
raise TesseractError(error_text)
|
if lang:
|
||||||
|
# If tesseract gives an error with a language parameter
|
||||||
|
# re-run it with no parameter again
|
||||||
|
return run_tesseract(input_filename, lang=None)
|
||||||
|
else:
|
||||||
|
raise TesseractError(error_text)
|
||||||
|
|
||||||
fd = codecs.open(ocr_output, 'r', 'utf-8')
|
fd = codecs.open(ocr_output, 'r', 'utf-8')
|
||||||
text = fd.read().strip()
|
text = fd.read().strip()
|
||||||
|
|||||||
Reference in New Issue
Block a user