diff --git a/docs/releases/2.2.rst b/docs/releases/2.2.rst index fc3b1a6408..07b8812949 100644 --- a/docs/releases/2.2.rst +++ b/docs/releases/2.2.rst @@ -35,10 +35,15 @@ on production install to debug errors live. - Refactor the remove document from folder view to allow removing documents from multiple folders at the same time. - Refactor the document mailing views and add support for sending multiple documents via email at the same time. - Refactor the document metadata views and add support for adding multiple metadata types to a document at the same time. +- Addition of a new OCR backend using PyOCR. This backend tries first to do OCR +using libtesseract. If libtesseract is not available the backend fallsback to +calling the Tesseract executable. Removals -------- -* None +- Removal of the OCR_TESSERACT_PATH configuration setting. +- Removal of the Tesseract OCR backend. Replaced with a PyOCR backend. +- Remove usage of pytesseract Python library. Upgrading from a previous version --------------------------------- diff --git a/mayan/apps/ocr/backends/pyocr.py b/mayan/apps/ocr/backends/pyocr.py new file mode 100644 index 0000000000..3c7c495217 --- /dev/null +++ b/mayan/apps/ocr/backends/pyocr.py @@ -0,0 +1,62 @@ +from __future__ import absolute_import, unicode_literals + +import logging + +from PIL import Image +import pyocr +import pyocr.builders + +from ..classes import OCRBackendBase +from ..exceptions import OCRError + +logger = logging.getLogger(__name__) + + +class PyOCR(OCRBackendBase): + def __init__(self, *args, **kwargs): + super(PyOCR, self).__init__(*args, **kwargs) + + tools = pyocr.get_available_tools() + if len(tools) == 0: + raise OCRError('No OCR tool found') + + # The tools are returned in the recommended order of usage + for tool in tools: + if tool.__name__ == 'pyocr.libtesseract': + self.tool = tool + + if not self.tool: + self.tool = tools[0] + + logger.debug('Will use tool \'%s\'', self.tool.get_name()) + + self.languages = self.tool.get_available_languages() + logger.debug('Available languages: %s', ', '.join(self.languages)) + + def execute(self, *args, **kwargs): + """ + Execute the command line binary of tesseract + """ + super(PyOCR, self).execute(*args, **kwargs) + + image = Image.open(self.converter.get_page()) + try: + result = self.tool.image_to_string( + image, + lang=self.language, + builder=pyocr.builders.TextBuilder() + ) + except Exception as exception: + error_message = 'Exception calling pyocr with language option: ' + '{}; {}'.format(self.language, exception) + + if self.language not in self.languages: + error_message = '{}\nThe requested OCR language "{}" is not ' + 'available and needs to be installed.\n'.format( + error_message, self.language + ) + + logger.error(error_message) + raise OCRError(error_message) + else: + return result diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py deleted file mode 100644 index cf27a3eea9..0000000000 --- a/mayan/apps/ocr/backends/tesseract.py +++ /dev/null @@ -1,59 +0,0 @@ -from __future__ import unicode_literals - -import logging - -import sh - -from PIL import Image -import pytesseract - -from ..classes import OCRBackendBase -from ..exceptions import OCRError -from ..settings import setting_tesseract_path - -logger = logging.getLogger(__name__) - - -class Tesseract(OCRBackendBase): - def __init__(self, *args, **kwargs): - super(Tesseract, self).__init__(*args, **kwargs) - try: - self.binary = sh.Command(setting_tesseract_path.value) - except sh.CommandNotFound: - self.binary = None - - def get_languages(self): - if self.binary: - result = self.binary(list_langs=True) - - return [ - language for language in result.stderr.split('\n') if language - ] - else: - return () - - def execute(self, *args, **kwargs): - """ - Execute the command line binary of tesseract - """ - super(Tesseract, self).execute(*args, **kwargs) - - # TODO: pass tesseract binary path to the pytesseract - image = Image.open(self.converter.get_page()) - try: - result = pytesseract.image_to_string( - image=image, lang=self.language - ) - # If tesseract gives an error with a language parameter - # re-run it with no language parameter - except Exception as exception: - error_message = 'Exception calling pytesseract with language option: {}; {}'.format(self.language, exception) - - if self.binary: - if self.language not in self.get_languages(): - error_message = '{}\nThe requested Tesseract language file for "{}" is not available and needs to be installed.\nIf using Debian or Ubuntu run: apt-get install tesseract-ocr-{}'.format(error_message, self.language, self.language) - - logger.error(error_message) - raise OCRError(error_message) - - return result diff --git a/mayan/apps/ocr/settings.py b/mayan/apps/ocr/settings.py index 340fbabe46..d1f3ea672c 100644 --- a/mayan/apps/ocr/settings.py +++ b/mayan/apps/ocr/settings.py @@ -5,10 +5,7 @@ from django.utils.translation import ugettext_lazy as _ from smart_settings import Namespace namespace = Namespace(name='ocr', label=_('OCR')) -setting_tesseract_path = namespace.add_setting( - global_name='OCR_TESSERACT_PATH', default='/usr/bin/tesseract', - help_text=_('File path to tesseract program.'), is_path=True -) + setting_pdftotext_path = namespace.add_setting( global_name='OCR_PDFTOTEXT_PATH', default='/usr/bin/pdftotext', help_text=_( @@ -18,7 +15,7 @@ setting_pdftotext_path = namespace.add_setting( is_path=True ) setting_ocr_backend = namespace.add_setting( - global_name='OCR_BACKEND', default='ocr.backends.tesseract.Tesseract', + global_name='OCR_BACKEND', default='ocr.backends.pyocr.PyOCR', help_text=_('Full path to the backend to be used to do OCR.') ) setting_auto_ocr = namespace.add_setting( diff --git a/removals.txt b/removals.txt index f1f6e99e49..91993470a6 100644 --- a/removals.txt +++ b/removals.txt @@ -1,2 +1,3 @@ # Packages to be remove during upgrades django-filetransfers +pytesseract diff --git a/requirements/base.txt b/requirements/base.txt index cbe0ed7a17..353757599b 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -27,7 +27,7 @@ fusepy==2.0.4 pdfminer==20140328 pycountry==1.20 -pytesseract==0.1.6 +pyocr==0.4.4 python-dateutil==2.5.3 python-gnupg==0.3.9 python-magic==0.4.12