56 lines
1.7 KiB
Python
56 lines
1.7 KiB
Python
from __future__ import absolute_import
|
|
|
|
import codecs
|
|
import errno
|
|
import os
|
|
import subprocess
|
|
import tempfile
|
|
|
|
from common.utils import fs_cleanup
|
|
|
|
from . import BackendBase
|
|
from ..exceptions import OCRError
|
|
from ..settings import TESSERACT_PATH
|
|
|
|
|
|
class Tesseract(BackendBase):
    """
    OCR backend that runs the tesseract command line binary
    """

    def execute(self, input_filename, language=None):
        """
        Execute the command line binary of tesseract

        The binary at TESSERACT_PATH is invoked with ``input_filename`` and
        a temporary output base name; the recognized text is then read
        (as UTF-8) from the ``txt`` file next to that base name.

        :param input_filename: path of the file handed to tesseract.
        :param language: optional value passed to tesseract with ``-l``.
            If the run fails while a language is set, it is retried once
            without any language argument.
        :return: the contents of the output file, stripped of leading and
            trailing whitespace.
        :raises OCRError: if the binary does not exist at TESSERACT_PATH,
            or if tesseract exits with a non-zero code and there is no
            language argument left to drop (the message is its stderr).
        """
        fd, filepath = tempfile.mkstemp()
        # Only the unique name is needed; the descriptor itself is unused.
        os.close(fd)
        ocr_output = os.extsep.join([filepath, u'txt'])
        command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]

        if language is not None:
            command.extend([u'-l', language])

        try:
            try:
                proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            except OSError as exception:
                if exception.errno == errno.ENOENT:
                    raise OCRError('Tesseract not found at %s' % TESSERACT_PATH)
                raise

            # communicate() drains stdout and stderr while waiting. Calling
            # wait() with both streams piped can deadlock once the child
            # fills a pipe buffer.
            _, error_text = proc.communicate()

            if proc.returncode != 0:
                if language:
                    # If tesseract gives an error with a language parameter
                    # re-run it with no parameter again
                    return self.execute(input_filename, language=None)
                raise OCRError(error_text)

            # The context manager closes the file even if decoding fails.
            with codecs.open(ocr_output, 'r', 'utf-8') as ocr_file:
                return ocr_file.read().strip()
        finally:
            # Remove both temporary files on every exit path (success,
            # failure, missing binary) so nothing is left in the temp dir.
            # NOTE(review): relies on fs_cleanup tolerating a missing file,
            # as the previous failure path already did for ocr_output.
            fs_cleanup(filepath)
            fs_cleanup(ocr_output)
|