Files
mayan-edms/mayan/apps/ocr/backends/tesseract.py
Roberto Rosario 8712c6ee37 PEP8 cleanups.
2016-05-17 05:08:21 -04:00

60 lines
1.9 KiB
Python

from __future__ import unicode_literals
import logging
import sh
from PIL import Image
import pytesseract
from ..classes import OCRBackendBase
from ..exceptions import OCRError
from ..settings import setting_tesseract_path
logger = logging.getLogger(__name__)
class Tesseract(OCRBackendBase):
    """
    OCR backend that extracts text from a page image with Tesseract.

    Text recognition is delegated to the ``pytesseract`` wrapper. The binary
    named by ``setting_tesseract_path`` is wrapped with ``sh`` and is only
    used to list the installed languages, so that a failed OCR run can be
    reported with a more helpful error message.
    """

    # First line printed by ``tesseract --list-langs``; it is a heading, not
    # a language code, so it must be filtered out of the results.
    _language_list_header = 'List of available languages'

    def __init__(self, *args, **kwargs):
        """
        Initialize the backend and try to locate the Tesseract binary.

        ``self.binary`` is set to an ``sh.Command`` for the configured path,
        or to ``None`` when ``sh`` cannot find the command.
        """
        super(Tesseract, self).__init__(*args, **kwargs)
        try:
            self.binary = sh.Command(setting_tesseract_path.value)
        except sh.CommandNotFound:
            # A missing binary is not fatal here: it only disables the
            # language listing used when building error messages.
            self.binary = None

    def get_languages(self):
        """
        Return the language codes reported by ``tesseract --list-langs``.

        Returns an empty tuple when the Tesseract binary was not found,
        otherwise a list with one entry per language code.
        """
        if not self.binary:
            return ()

        result = self.binary(list_langs=True)

        languages = []
        # Depending on the Tesseract release the list is written to stderr
        # or to stdout, so both streams are inspected.
        for stream in (result.stdout, result.stderr):
            if not stream:
                continue
            if isinstance(stream, bytes):
                stream = stream.decode('utf-8', 'replace')
            for line in stream.splitlines():
                line = line.strip()
                if line and not line.startswith(self._language_list_header):
                    languages.append(line)

        return languages

    def execute(self, *args, **kwargs):
        """
        Run OCR on the page returned by ``self.converter.get_page()``.

        Returns the text produced by ``pytesseract.image_to_string`` for the
        language in ``self.language``.

        Raises ``OCRError`` when pytesseract fails for any reason. If the
        Tesseract binary is available and the requested language is not among
        the installed ones, the error message also explains how to install
        the missing language file.
        """
        # NOTE(review): ``self.converter`` and ``self.language`` are presumably
        # prepared by the parent class' execute() -- confirm in OCRBackendBase.
        super(Tesseract, self).execute(*args, **kwargs)

        # TODO: pass tesseract binary path to the pytesseract
        image = Image.open(self.converter.get_page())
        try:
            result = pytesseract.image_to_string(
                image=image, lang=self.language
            )
        except Exception as exception:
            # Any pytesseract failure is converted into an OCRError. There is
            # no retry without the language option; instead the message is
            # extended when the cause looks like a missing language file.
            error_message = (
                'Exception calling pytesseract with language option: {}; {}'
            ).format(self.language, exception)

            if self.binary:
                if self.language not in self.get_languages():
                    error_message = (
                        '{}\nThe requested Tesseract language file for "{}" '
                        'is not available and needs to be installed.\n'
                        'If using Debian or Ubuntu run: '
                        'apt-get install tesseract-ocr-{}'
                    ).format(error_message, self.language, self.language)

            logger.error(error_message)
            raise OCRError(error_message)

        return result