Files
mayan-edms/mayan/apps/ocr/backends/tesseract.py
2014-09-30 15:48:42 -04:00

56 lines
1.7 KiB
Python

from __future__ import absolute_import
import codecs
import errno
import os
import subprocess
import tempfile
from common.utils import fs_cleanup
from . import BackendBase
from ..exceptions import OCRError
from ..settings import TESSERACT_PATH
class Tesseract(BackendBase):
    """
    OCR backend that runs the tesseract command line binary
    """

    def execute(self, input_filename, language=None):
        """
        Execute the command line binary of tesseract

        Runs the binary configured in TESSERACT_PATH against
        ``input_filename`` and returns the recognized text, decoded as
        UTF-8 and stripped of leading and trailing whitespace.

        ``language``, when not None, is passed to the binary with the
        ``-l`` option. If the run fails while a language was given, the
        command is retried once without the language option.

        Raises OCRError if the binary is not found at TESSERACT_PATH or if
        the run without a language option exits with a non zero return
        code (the error message is the captured stderr output). Any other
        OSError raised while launching the binary is propagated unchanged.
        """
        # mkstemp is only used to reserve a unique path; the descriptor is
        # not needed because the binary writes the output itself.
        fd, filepath = tempfile.mkstemp()
        os.close(fd)

        # The recognized text is read back from "<filepath>.txt", the file
        # this code expects the binary to create for the output base it is
        # given as third argument.
        ocr_output = os.extsep.join([filepath, u'txt'])
        command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]

        if language is not None:
            command.extend([u'-l', language])

        try:
            try:
                proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            except OSError as exception:
                if exception.errno == errno.ENOENT:
                    raise OCRError('Tesseract not found at %s' % TESSERACT_PATH)
                raise

            # communicate() drains stdout and stderr while waiting for the
            # process; wait() alone can block forever once the child fills
            # one of the pipes.
            _, error_text = proc.communicate()

            if proc.returncode == 0:
                with codecs.open(ocr_output, 'r', 'utf-8') as ocr_file:
                    return ocr_file.read().strip()
        finally:
            # Remove both temporary files on every exit path (success,
            # failure and exceptions) so none are left behind.
            # NOTE(review): relies on fs_cleanup tolerating a missing file,
            # as the previous error path already did for the output file
            # - confirm against common.utils.
            fs_cleanup(filepath)
            fs_cleanup(ocr_output)

        if language:
            # If tesseract gives an error with a language parameter
            # re-run it with no parameter again
            return self.execute(input_filename, language=None)

        raise OCRError(error_text)