Add new OCR backend using PyOCR. Remove current direct call Tesseract backend.

This commit is contained in:
Roberto Rosario
2016-12-30 00:36:45 -04:00
parent 5b94b419e9
commit 6bfdb053e3
6 changed files with 72 additions and 66 deletions

View File

@@ -35,10 +35,15 @@ on production install to debug errors live.
- Refactor the remove document from folder view to allow removing documents from multiple folders at the same time.
- Refactor the document mailing views and add support for sending multiple documents via email at the same time.
- Refactor the document metadata views and add support for adding multiple metadata types to a document at the same time.
- Addition of a new OCR backend using PyOCR. This backend first tries to do OCR
using libtesseract. If libtesseract is not available, the backend falls back to
calling the Tesseract executable.
Removals
--------
* None
- Removal of the OCR_TESSERACT_PATH configuration setting.
- Removal of the Tesseract OCR backend. Replaced with a PyOCR backend.
- Remove usage of pytesseract Python library.
Upgrading from a previous version
---------------------------------

View File

@@ -0,0 +1,62 @@
from __future__ import absolute_import, unicode_literals
import logging
from PIL import Image
import pyocr
import pyocr.builders
from ..classes import OCRBackendBase
from ..exceptions import OCRError
logger = logging.getLogger(__name__)
class PyOCR(OCRBackendBase):
    """
    OCR backend that delegates text recognition to a tool found by PyOCR.
    """

    def __init__(self, *args, **kwargs):
        """
        Select the OCR tool to use and cache the languages it reports.

        The ``pyocr.libtesseract`` tool is used when PyOCR lists it;
        otherwise the first tool returned by PyOCR is used.

        Raises OCRError when PyOCR reports no available tool.
        """
        super(PyOCR, self).__init__(*args, **kwargs)

        tools = pyocr.get_available_tools()
        if not tools:
            raise OCRError('No OCR tool found')

        # The tools are returned in the recommended order of usage, so the
        # first one is the fallback. Assigning it up front guarantees that
        # self.tool exists even when libtesseract is not in the list.
        self.tool = tools[0]
        for tool in tools:
            if tool.__name__ == 'pyocr.libtesseract':
                self.tool = tool
                break

        logger.debug('Will use tool \'%s\'', self.tool.get_name())

        self.languages = self.tool.get_available_languages()
        logger.debug('Available languages: %s', ', '.join(self.languages))

    def execute(self, *args, **kwargs):
        """
        Run the selected PyOCR tool on the image opened from
        self.converter.get_page() and return the value produced by the tool.

        Raises OCRError when the tool raises any exception. The message
        includes a hint when self.language is not among the languages the
        tool reported as available.
        """
        super(PyOCR, self).execute(*args, **kwargs)

        image = Image.open(self.converter.get_page())
        try:
            result = self.tool.image_to_string(
                image,
                lang=self.language,
                builder=pyocr.builders.TextBuilder()
            )
        except Exception as exception:
            # The literals are wrapped in parentheses so that .format()
            # applies to the whole concatenated message and not only to a
            # dangling second literal.
            error_message = (
                'Exception calling pyocr with language option: '
                '{}; {}'
            ).format(self.language, exception)

            if self.language not in self.languages:
                error_message = (
                    '{}\nThe requested OCR language "{}" is not '
                    'available and needs to be installed.\n'
                ).format(error_message, self.language)

            logger.error(error_message)
            raise OCRError(error_message)
        else:
            return result

View File

@@ -1,59 +0,0 @@
from __future__ import unicode_literals
import logging
import sh
from PIL import Image
import pytesseract
from ..classes import OCRBackendBase
from ..exceptions import OCRError
from ..settings import setting_tesseract_path
logger = logging.getLogger(__name__)
class Tesseract(OCRBackendBase):
    """
    OCR backend that performs recognition through the pytesseract library.
    """

    def __init__(self, *args, **kwargs):
        """
        Wrap the configured tesseract program path in an sh command.

        self.binary is set to None when sh cannot find the program.
        """
        super(Tesseract, self).__init__(*args, **kwargs)

        try:
            self.binary = sh.Command(setting_tesseract_path.value)
        except sh.CommandNotFound:
            self.binary = None

    def get_languages(self):
        """
        Return the non-empty stderr lines produced by calling the binary
        with list_langs=True, or an empty tuple when there is no binary.
        """
        if not self.binary:
            return ()

        output = self.binary(list_langs=True)
        return [line for line in output.stderr.split('\n') if line]

    def execute(self, *args, **kwargs):
        """
        Open the image from self.converter.get_page(), pass it to
        pytesseract with self.language and return the result.

        Raises OCRError when pytesseract raises any exception. When the
        binary is available and does not list self.language, the message
        also carries an installation hint.
        """
        super(Tesseract, self).execute(*args, **kwargs)

        # TODO: pass tesseract binary path to the pytesseract
        page_image = Image.open(self.converter.get_page())
        try:
            text = pytesseract.image_to_string(
                image=page_image, lang=self.language
            )
        except Exception as exception:
            message = (
                'Exception calling pytesseract with language option: {}; {}'
            ).format(self.language, exception)

            # The language list is only queried when a binary exists.
            language_missing = (
                self.binary and self.language not in self.get_languages()
            )
            if language_missing:
                message = (
                    '{}\nThe requested Tesseract language file for "{}" is '
                    'not available and needs to be installed.\nIf using '
                    'Debian or Ubuntu run: apt-get install tesseract-ocr-{}'
                ).format(message, self.language, self.language)

            logger.error(message)
            raise OCRError(message)

        return text

View File

@@ -5,10 +5,7 @@ from django.utils.translation import ugettext_lazy as _
from smart_settings import Namespace
# Settings namespace named 'ocr' on which the OCR settings are registered.
namespace = Namespace(name='ocr', label=_('OCR'))
# File path to the tesseract program, exposed as OCR_TESSERACT_PATH.
# NOTE(review): the Tesseract backend that imports this setting is being
# removed - confirm nothing else reads it before keeping or dropping it.
setting_tesseract_path = namespace.add_setting(
global_name='OCR_TESSERACT_PATH', default='/usr/bin/tesseract',
help_text=_('File path to tesseract program.'), is_path=True
)
setting_pdftotext_path = namespace.add_setting(
global_name='OCR_PDFTOTEXT_PATH', default='/usr/bin/pdftotext',
help_text=_(
@@ -18,7 +15,7 @@ setting_pdftotext_path = namespace.add_setting(
is_path=True
)
# Full dotted path to the backend class used to do OCR, exposed as
# OCR_BACKEND.
# NOTE(review): two global_name/default lines appear here; presumably the
# first is the previous default and the second its replacement - confirm
# that only one remains in the real file.
setting_ocr_backend = namespace.add_setting(
global_name='OCR_BACKEND', default='ocr.backends.tesseract.Tesseract',
global_name='OCR_BACKEND', default='ocr.backends.pyocr.PyOCR',
help_text=_('Full path to the backend to be used to do OCR.')
)
setting_auto_ocr = namespace.add_setting(

View File

@@ -1,2 +1,3 @@
# Packages to be removed during upgrades
django-filetransfers
pytesseract

View File

@@ -27,7 +27,7 @@ fusepy==2.0.4
pdfminer==20140328
pycountry==1.20
pytesseract==0.1.6
pyocr==0.4.4
python-dateutil==2.5.3
python-gnupg==0.3.9
python-magic==0.4.12