from __future__ import unicode_literals import logging import sh from PIL import Image import pytesseract from ..classes import OCRBackendBase from ..exceptions import OCRError from ..settings import setting_tesseract_path logger = logging.getLogger(__name__) class Tesseract(OCRBackendBase): def __init__(self, *args, **kwargs): super(Tesseract, self).__init__(*args, **kwargs) try: self.binary = sh.Command(setting_tesseract_path.value) except sh.CommandNotFound: self.binary = None def get_languages(self): if self.binary: result = self.binary(list_langs=True) return [ language for language in result.stderr.split('\n') if language ] else: return () def execute(self, *args, **kwargs): """ Execute the command line binary of tesseract """ super(Tesseract, self).execute(*args, **kwargs) # TODO: pass tesseract binary path to the pytesseract image = Image.open(self.converter.get_page()) try: result = pytesseract.image_to_string( image=image, lang=self.language ) # If tesseract gives an error with a language parameter # re-run it with no language parameter except Exception as exception: error_message = 'Exception calling pytesseract with language option: {}; {}'.format(self.language, exception) if self.binary: if self.language not in self.get_languages(): error_message = '{}\nThe requested Tesseract language file for "{}" is not available and needs to be installed.\nIf using Debian or Ubuntu run: apt-get install tesseract-ocr-{}'.format(error_message, self.language, self.language) logger.error(error_message) raise OCRError(error_message) return result