64 lines
1.9 KiB
Python
64 lines
1.9 KiB
Python
from __future__ import absolute_import, unicode_literals
|
|
|
|
import logging
|
|
|
|
from PIL import Image
|
|
import pyocr
|
|
import pyocr.builders
|
|
|
|
from ..classes import OCRBackendBase
|
|
from ..exceptions import OCRError
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PyOCR(OCRBackendBase):
    """
    OCR backend that delegates text recognition to the pyocr library.

    On construction the first tool reported by pyocr is selected, unless
    the 'pyocr.libtesseract' tool is among the available ones, in which
    case that tool is used instead.
    """

    def __init__(self, *args, **kwargs):
        """
        Select the pyocr tool to use and record its available languages.

        All arguments are forwarded unchanged to the parent class.

        Raises OCRError if pyocr reports no available tools.
        """
        super(PyOCR, self).__init__(*args, **kwargs)

        # Defined up front so the attribute exists even if tool discovery
        # below raises.
        self.languages = ()

        tools = pyocr.get_available_tools()
        if not tools:
            raise OCRError('No OCR tool found')

        # The tools are returned in the recommended order of usage, so the
        # first one is the default choice.
        self.tool = tools[0]

        # Use the libtesseract tool instead whenever it is available.
        for tool in tools:
            if tool.__name__ == 'pyocr.libtesseract':
                self.tool = tool
                break

        logger.debug('Will use tool \'%s\'', self.tool.get_name())

        self.languages = self.tool.get_available_languages()
        logger.debug('Available languages: %s', ', '.join(self.languages))

    def execute(self, *args, **kwargs):
        """
        Run the selected pyocr tool on the image returned by
        self.converter.get_page() and return the recognized text.

        All arguments are forwarded unchanged to the parent class.

        Raises OCRError if the tool fails for any reason. When
        self.language is not among the tool's available languages the
        error message also says that the language needs to be installed.
        """
        super(PyOCR, self).execute(*args, **kwargs)

        # Open the image as a context manager so it is released once the
        # OCR call finishes, whether it succeeds or fails.
        with Image.open(self.converter.get_page()) as image:
            try:
                result = self.tool.image_to_string(
                    image,
                    lang=self.language,
                    builder=pyocr.builders.TextBuilder()
                )
            except Exception as exception:
                # Any tool failure is reported to callers as an OCRError.
                error_message = ('Exception calling pyocr with language option: '
                                 '{}; {}').format(self.language, exception)

                # A missing language pack is a likely cause of failure, so
                # point it out explicitly.
                if self.language not in self.languages:
                    error_message = ('{}\nThe requested OCR language "{}" is not '
                                     'available and needs to be installed.\n').format(
                        error_message, self.language
                    )

                logger.error(error_message)
                raise OCRError(error_message)
            else:
                return result
|