From e0347785b7e66ba2fdd7743bb70fa598aea93435 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Tue, 24 Jun 2014 22:48:16 -0400 Subject: [PATCH] Initial implementation of OCR pluggable backends --- mayan/apps/ocr/api.py | 53 ++++++---------------------- mayan/apps/ocr/backends/__init__.py | 26 ++++++++++++++ mayan/apps/ocr/backends/tesseract.py | 45 +++++++++++++++++++++++ mayan/apps/ocr/conf/settings.py | 3 +- mayan/apps/ocr/exceptions.py | 4 +-- 5 files changed, 85 insertions(+), 46 deletions(-) create mode 100644 mayan/apps/ocr/backends/__init__.py create mode 100644 mayan/apps/ocr/backends/tesseract.py diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index 5c52388f46..86bd8435ef 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -1,4 +1,3 @@ -#Some code from http://wiki.github.com/hoffstaetter/python-tesseract from __future__ import absolute_import import codecs @@ -14,8 +13,9 @@ from common.conf.settings import TEMPORARY_DIRECTORY from converter.api import convert from documents.models import DocumentPage -from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH) -from .exceptions import TesseractError, UnpaperError +from .backends import ocr_backend +from .conf.settings import UNPAPER_PATH, LANGUAGE +from .exceptions import OCRError, UnpaperError from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT, DEFAULT_OCR_FILE_EXTENSION) from .parsers import parse_document_page @@ -24,13 +24,13 @@ from .parsers.exceptions import ParserError, ParserUnknownFile def get_language_backend(): """ - Return the OCR cleanup language backend using the selected tesseract - language in the configuration settings + Return the OCR cleanup language backend using the selected language + in the configuration settings """ try: - module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE])) + module = import_module(u'.'.join([u'ocr', u'lang', LANGUAGE])) except ImportError: - sys.stderr.write(u'\nWarning: No OCR app 
language backend for language: %s\n\n' % TESSERACT_LANGUAGE) + sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE) return None return module @@ -47,45 +47,12 @@ def cleanup(filename): pass -def run_tesseract(input_filename, lang=None): - """ - Execute the command line binary of tesseract - """ - fd, filepath = tempfile.mkstemp() - os.close(fd) - ocr_output = os.extsep.join([filepath, u'txt']) - command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)] - - if lang is not None: - command.extend([u'-l', lang]) - - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - error_text = proc.stderr.read() - cleanup(filepath) - cleanup(ocr_output) - if lang: - # If tesseract gives an error with a language parameter - # re-run it with no parameter again - return run_tesseract(input_filename, lang=None) - else: - raise TesseractError(error_text) - - fd = codecs.open(ocr_output, 'r', 'utf-8') - text = fd.read().strip() - fd.close() - - os.unlink(filepath) - - return text - - def do_document_ocr(queue_document): """ Try first to extract text from document pages using the registered parser, if the parser fails or if there is no parser registered for - the document mimetype do a visual OCR by calling tesseract + the document mimetype do a visual OCR by calling the corresponding + OCR backend """ for document_page in queue_document.document.pages.all(): try: @@ -116,7 +83,7 @@ def do_document_ocr(queue_document): pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]) os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: - ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE) + ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE) document_page.content = ocr_cleanup(ocr_text) document_page.page_label = _(u'Text from OCR') diff --git 
a/mayan/apps/ocr/backends/__init__.py b/mayan/apps/ocr/backends/__init__.py
new file mode 100644
index 0000000000..d1bdbeca9d
--- /dev/null
+++ b/mayan/apps/ocr/backends/__init__.py
@@ -0,0 +1,42 @@
+from __future__ import absolute_import
+
+import sys
+
+from django.utils.importlib import import_module
+
+from ..conf.settings import BACKEND
+
+
+class BackendBase(object):
+    """
+    Interface that every OCR backend class must implement
+    """
+    def execute(self, input_filename, language=None):
+        """
+        Return the text recognized in input_filename
+        """
+        raise NotImplementedError
+
+
+def get_ocr_backend():
+    """
+    Return an instance of the OCR backend defined in the module path
+    specified in the configuration settings
+    """
+    try:
+        module = import_module(BACKEND)
+    except ImportError:
+        sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND)
+        raise
+
+    # BACKEND is a module path but callers invoke ocr_backend.execute(),
+    # so instantiate the BackendBase subclass that the module defines
+    for attribute in vars(module).values():
+        is_class = isinstance(attribute, type)
+        if is_class and issubclass(attribute, BackendBase) and attribute is not BackendBase:
+            return attribute()
+
+    # Modules that expose a module level execute() are returned as is
+    return module
+
+ocr_backend = get_ocr_backend()
diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py
new file mode 100644
index 0000000000..b1bfef03a1
--- /dev/null
+++ b/mayan/apps/ocr/backends/tesseract.py
@@ -0,0 +1,58 @@
+from __future__ import absolute_import
+
+import codecs
+import os
+import subprocess
+import tempfile
+import sys
+
+from . import BackendBase
+from ..conf.settings import TESSERACT_PATH
+from ..exceptions import OCRError
+
+
+def cleanup(filename):
+    """
+    Remove a temporary file, ignoring the error if it is already gone
+    """
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
+
+
+class Tesseract(BackendBase):
+    """
+    OCR backend that calls the tesseract command line binary
+    """
+    def execute(self, input_filename, language=None):
+        """
+        Execute the command line binary of tesseract and return the
+        recognized text, raises OCRError if tesseract fails
+        """
+        fd, filepath = tempfile.mkstemp()
+        os.close(fd)
+        ocr_output = os.extsep.join([filepath, u'txt'])
+        command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
+
+        if language is not None:
+            command.extend([u'-l', language])
+
+        try:
+            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
+            # communicate() drains both pipes; wait() can deadlock when
+            # tesseract fills one of them
+            error_text = proc.communicate()[1]
+            if proc.returncode != 0:
+                if language:
+                    # If tesseract gives an error with a language parameter
+                    # re-run it with no parameter again
+                    return self.execute(input_filename, language=None)
+                raise OCRError(error_text)
+
+            with codecs.open(ocr_output, 'r', 'utf-8') as file_object:
+                return file_object.read().strip()
+        finally:
+            # Remove the mkstemp placeholder and tesseract's .txt output
+            cleanup(filepath)
+            cleanup(ocr_output)
diff --git a/mayan/apps/ocr/conf/settings.py b/mayan/apps/ocr/conf/settings.py index b7cce9421e..3ec44f1735 100644 --- a/mayan/apps/ocr/conf/settings.py +++ b/mayan/apps/ocr/conf/settings.py @@ -9,12 +9,13 @@ register_settings( module=u'ocr.conf.settings', settings=[ {'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True}, - {'name': u'TESSERACT_LANGUAGE', 'global_name': u'OCR_TESSERACT_LANGUAGE', 'default': u'eng'}, + {'name': u'LANGUAGE', 'global_name': u'OCR_LANGUAGE', 'default': u'eng'}, {'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 0, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')}, {'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')}, {'name': u'AUTOMATIC_OCR', 'global_name': 
u'OCR_AUTOMATIC_OCR', 'default': True, 'description': _(u'Automatically queue newly created documents for OCR.')}, {'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10}, {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, + {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')}, ] ) diff --git a/mayan/apps/ocr/exceptions.py b/mayan/apps/ocr/exceptions.py index 32ec4c4c07..1a69744f5a 100644 --- a/mayan/apps/ocr/exceptions.py +++ b/mayan/apps/ocr/exceptions.py @@ -5,9 +5,9 @@ class AlreadyQueued(Exception): pass -class TesseractError(Exception): +class OCRError(Exception): """ - Raised by tesseract + Raised by the OCR backend """ pass