Add new OCR backend using PyOCR. Remove current direct call Tesseract backend.
This commit is contained in:
@@ -35,10 +35,15 @@ on production install to debug errors live.
|
||||
- Refactor the remove document from folder view to allow removing documents from multiple folders at the same time.
|
||||
- Refactor the document mailing views and add support for sending multiple documents via email at the same time.
|
||||
- Refactor the document metadata views and add support for adding multiple metadata types to a document at the same time.
|
||||
- Addition of a new OCR backend using PyOCR. This backend tries first to do OCR
|
||||
using libtesseract. If libtesseract is not available, the backend falls back to
|
||||
calling the Tesseract executable.
|
||||
|
||||
Removals
|
||||
--------
|
||||
* None
|
||||
- Removal of the OCR_TESSERACT_PATH configuration setting.
|
||||
- Removal of the Tesseract OCR backend. Replaced with a PyOCR backend.
|
||||
- Remove usage of pytesseract Python library.
|
||||
|
||||
Upgrading from a previous version
|
||||
---------------------------------
|
||||
|
||||
62
mayan/apps/ocr/backends/pyocr.py
Normal file
62
mayan/apps/ocr/backends/pyocr.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from PIL import Image
|
||||
import pyocr
|
||||
import pyocr.builders
|
||||
|
||||
from ..classes import OCRBackendBase
|
||||
from ..exceptions import OCRError
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class PyOCR(OCRBackendBase):
    """
    OCR backend that delegates the text recognition to the PyOCR library.

    The libtesseract binding is preferred when PyOCR reports it as available;
    otherwise the first tool reported by PyOCR is used.
    """

    def __init__(self, *args, **kwargs):
        """
        Select the PyOCR tool to use and cache its available languages.

        Raises:
            OCRError: if PyOCR reports no available OCR tool.
        """
        super(PyOCR, self).__init__(*args, **kwargs)

        tools = pyocr.get_available_tools()
        if not tools:
            raise OCRError('No OCR tool found')

        # The tools are returned in the recommended order of usage, so the
        # first one is the default. It is assigned up front so that
        # self.tool is always defined, even when libtesseract is missing.
        self.tool = tools[0]
        for tool in tools:
            if tool.__name__ == 'pyocr.libtesseract':
                self.tool = tool
                break

        logger.debug('Will use tool \'%s\'', self.tool.get_name())

        self.languages = self.tool.get_available_languages()
        logger.debug('Available languages: %s', ', '.join(self.languages))

    def execute(self, *args, **kwargs):
        """
        Run OCR over the current page image using the selected PyOCR tool.

        Returns the value produced by the tool's image_to_string() call with
        a TextBuilder.
        NOTE(review): self.converter and self.language are presumably set up
        by the base class execute() -- confirm against OCRBackendBase.

        Raises:
            OCRError: if the PyOCR call fails for any reason. The message
                mentions when the requested language is not among the
                languages the tool reported as available.
        """
        super(PyOCR, self).execute(*args, **kwargs)

        image = Image.open(self.converter.get_page())
        try:
            result = self.tool.image_to_string(
                image,
                lang=self.language,
                builder=pyocr.builders.TextBuilder()
            )
        except Exception as exception:
            # The parentheses make both literals a single string so that
            # .format() applies to the whole message.
            error_message = (
                'Exception calling pyocr with language option: '
                '{}; {}'
            ).format(self.language, exception)

            if self.language not in self.languages:
                error_message = (
                    '{}\nThe requested OCR language "{}" is not '
                    'available and needs to be installed.\n'
                ).format(error_message, self.language)

            logger.error(error_message)
            raise OCRError(error_message)
        else:
            return result
|
||||
@@ -1,59 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
import sh
|
||||
|
||||
from PIL import Image
|
||||
import pytesseract
|
||||
|
||||
from ..classes import OCRBackendBase
|
||||
from ..exceptions import OCRError
|
||||
from ..settings import setting_tesseract_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Tesseract(OCRBackendBase):
    """
    OCR backend that performs the recognition through pytesseract and uses
    the configured tesseract program to list the installed languages.
    """

    def __init__(self, *args, **kwargs):
        """
        Wrap the configured tesseract program path as a callable command.

        self.binary is left as None when the command cannot be found.
        """
        super(Tesseract, self).__init__(*args, **kwargs)
        try:
            self.binary = sh.Command(setting_tesseract_path.value)
        except sh.CommandNotFound:
            self.binary = None

    def get_languages(self):
        """
        Return the non-empty lines the tesseract program writes to stderr
        when called with the list_langs option, or an empty tuple when no
        binary is available.
        """
        if not self.binary:
            return ()

        listing = self.binary(list_langs=True)
        return [line for line in listing.stderr.split('\n') if line]

    def execute(self, *args, **kwargs):
        """
        Run OCR over the current page image using pytesseract.

        Raises:
            OCRError: if the pytesseract call fails. When the binary is
                available and the requested language is not listed, the
                message includes installation instructions.
        """
        super(Tesseract, self).execute(*args, **kwargs)

        # TODO: pass tesseract binary path to the pytesseract
        page_image = Image.open(self.converter.get_page())
        try:
            return pytesseract.image_to_string(
                image=page_image, lang=self.language
            )
        except Exception as exception:
            # Any failure is reported as an OCRError; nothing is retried.
            error_message = (
                'Exception calling pytesseract with language option: {}; {}'
            ).format(self.language, exception)

            language_missing = (
                self.binary and self.language not in self.get_languages()
            )
            if language_missing:
                error_message = (
                    '{}\nThe requested Tesseract language file for "{}" is '
                    'not available and needs to be installed.\nIf using '
                    'Debian or Ubuntu run: apt-get install tesseract-ocr-{}'
                ).format(error_message, self.language, self.language)

            logger.error(error_message)
            raise OCRError(error_message)
|
||||
@@ -5,10 +5,7 @@ from django.utils.translation import ugettext_lazy as _
|
||||
from smart_settings import Namespace
|
||||
|
||||
namespace = Namespace(name='ocr', label=_('OCR'))
|
||||
setting_tesseract_path = namespace.add_setting(
|
||||
global_name='OCR_TESSERACT_PATH', default='/usr/bin/tesseract',
|
||||
help_text=_('File path to tesseract program.'), is_path=True
|
||||
)
|
||||
|
||||
setting_pdftotext_path = namespace.add_setting(
|
||||
global_name='OCR_PDFTOTEXT_PATH', default='/usr/bin/pdftotext',
|
||||
help_text=_(
|
||||
@@ -18,7 +15,7 @@ setting_pdftotext_path = namespace.add_setting(
|
||||
is_path=True
|
||||
)
|
||||
setting_ocr_backend = namespace.add_setting(
|
||||
global_name='OCR_BACKEND', default='ocr.backends.tesseract.Tesseract',
|
||||
global_name='OCR_BACKEND', default='ocr.backends.pyocr.PyOCR',
|
||||
help_text=_('Full path to the backend to be used to do OCR.')
|
||||
)
|
||||
setting_auto_ocr = namespace.add_setting(
|
||||
|
||||
@@ -1,2 +1,3 @@
|
||||
# Packages to be removed during upgrades
|
||||
django-filetransfers
|
||||
pytesseract
|
||||
|
||||
@@ -27,7 +27,7 @@ fusepy==2.0.4
|
||||
|
||||
pdfminer==20140328
|
||||
pycountry==1.20
|
||||
pytesseract==0.1.6
|
||||
pyocr==0.4.4
|
||||
python-dateutil==2.5.3
|
||||
python-gnupg==0.3.9
|
||||
python-magic==0.4.12
|
||||
|
||||
Reference in New Issue
Block a user