Initial implementation of OCR pluggable backends

This commit is contained in:
Roberto Rosario
2014-06-24 22:48:16 -04:00
parent 198402385e
commit e0347785b7
5 changed files with 85 additions and 46 deletions

View File

@@ -1,4 +1,3 @@
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
from __future__ import absolute_import
import codecs
@@ -14,8 +13,9 @@ from common.conf.settings import TEMPORARY_DIRECTORY
from converter.api import convert
from documents.models import DocumentPage
from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
from .exceptions import TesseractError, UnpaperError
from .backends import ocr_backend
from .conf.settings import UNPAPER_PATH, LANGUAGE
from .exceptions import OCRError, UnpaperError
from .literals import (DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT,
DEFAULT_OCR_FILE_EXTENSION)
from .parsers import parse_document_page
@@ -24,13 +24,13 @@ from .parsers.exceptions import ParserError, ParserUnknownFile
def get_language_backend():
"""
Return the OCR cleanup language backend using the selected tesseract
language in the configuration settings
Return the OCR cleanup language backend using the selected language
in the configuration settings
"""
try:
module = import_module(u'.'.join([u'ocr', u'lang', TESSERACT_LANGUAGE]))
module = import_module(u'.'.join([u'ocr', u'lang', LANGUAGE]))
except ImportError:
sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % TESSERACT_LANGUAGE)
sys.stderr.write(u'\nWarning: No OCR app language backend for language: %s\n\n' % LANGUAGE)
return None
return module
@@ -47,45 +47,12 @@ def cleanup(filename):
pass
def run_tesseract(input_filename, lang=None):
    """
    Execute the command line binary of tesseract and return the
    recognized text.

    input_filename: path of the image file to hand to tesseract.
    lang: optional language code passed with ``-l``; when the run with a
    language fails, tesseract is executed once more without it.

    Returns the stripped, UTF-8 decoded content of tesseract's output
    file.  Raises TesseractError (carrying tesseract's stderr output)
    when the run without a language parameter fails.
    """
    # mkstemp is only used to reserve a unique base name; tesseract
    # itself writes its result to "<base name>.txt".
    fd, filepath = tempfile.mkstemp()
    os.close(fd)
    ocr_output = os.extsep.join([filepath, u'txt'])

    command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
    if lang is not None:
        command.extend([u'-l', lang])

    try:
        proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
        # communicate() drains both pipes while waiting; wait() alone can
        # block forever once the child fills a pipe buffer.
        error_text = proc.communicate()[1]
        if proc.returncode == 0:
            with codecs.open(ocr_output, 'r', 'utf-8') as output_file:
                return output_file.read().strip()
    finally:
        # Remove both temporary files on every path (the success path
        # previously left the ".txt" output behind).  cleanup() is the
        # sibling helper the error path was already using for both files.
        cleanup(filepath)
        cleanup(ocr_output)

    if lang:
        # If tesseract gives an error with a language parameter
        # re-run it with no parameter again
        return run_tesseract(input_filename, lang=None)

    raise TesseractError(error_text)
def do_document_ocr(queue_document):
"""
Try first to extract text from document pages using the registered
parser, if the parser fails or if there is no parser registered for
the document mimetype do a visual OCR by calling tesseract
the document mimetype do a visual OCR by calling the corresponding
OCR backend
"""
for document_page in queue_document.document.pages.all():
try:
@@ -116,7 +83,7 @@ def do_document_ocr(queue_document):
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
try:
ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, LANGUAGE)
document_page.content = ocr_cleanup(ocr_text)
document_page.page_label = _(u'Text from OCR')

View File

@@ -0,0 +1,26 @@
from __future__ import absolute_import
from django.utils.importlib import import_module
from ..conf.settings import BACKEND
class BackendBase(object):
    """
    Base class for OCR backends.

    Subclasses override execute() to run the actual OCR engine.
    """

    def execute(self, input_filename, language=None):
        """
        Perform OCR on input_filename, optionally using the given
        language.

        Always raises NotImplementedError here; concrete backends must
        provide their own implementation.
        """
        # NotImplemented is a comparison sentinel, not an exception;
        # raising it produces a TypeError instead of the intended error.
        raise NotImplementedError
def get_ocr_backend():
    """
    Return the OCR backend using the path specified in the configuration
    settings

    The BACKEND setting is imported as a module and that module is
    returned.  When the import fails a warning is written to stderr and
    the ImportError is re-raised.
    """
    try:
        return import_module(BACKEND)
    except ImportError:
        # Imported locally because this module does not import sys at
        # the top; without it the warning itself raised a NameError and
        # hid the real ImportError.
        import sys

        sys.stderr.write(u'\nWarning: No OCR backend named: %s\n\n' % BACKEND)
        raise


# Resolved once, at import time, so other modules can simply do
# "from .backends import ocr_backend".
ocr_backend = get_ocr_backend()

View File

@@ -0,0 +1,45 @@
from __future__ import absolute_import
import codecs
import os
import subprocess
import tempfile
import sys
from . import BackendBase
from ..conf.settings import TESSERACT_PATH
class Tesseract(BackendBase):
    """
    OCR backend that shells out to the tesseract command line binary.
    """

    @staticmethod
    def _remove_quietly(path):
        """Delete path, ignoring the error if it cannot be removed."""
        try:
            os.remove(path)
        except OSError:
            pass

    def execute(self, input_filename, language=None):
        """
        Execute the command line binary of tesseract and return the
        recognized text.

        input_filename: path of the image file to hand to tesseract.
        language: optional language code passed with ``-l``; when the
        run with a language fails, tesseract is executed once more
        without it.

        Returns the stripped, UTF-8 decoded content of tesseract's
        output file.  Raises OCRError (carrying tesseract's stderr
        output) when the run without a language parameter fails.
        """
        # Imported here because this module's top-level imports do not
        # bring in the exception; TesseractError was renamed to OCRError.
        from ..exceptions import OCRError

        # mkstemp is only used to reserve a unique base name; tesseract
        # itself writes its result to "<base name>.txt".
        fd, filepath = tempfile.mkstemp()
        os.close(fd)
        ocr_output = os.extsep.join([filepath, u'txt'])

        command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
        if language is not None:
            command.extend([u'-l', language])

        try:
            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() alone
            # can block forever once the child fills a pipe buffer.
            error_text = proc.communicate()[1]
            if proc.returncode == 0:
                with codecs.open(ocr_output, 'r', 'utf-8') as output_file:
                    return output_file.read().strip()
        finally:
            # Both temporary files are removed on every path, including
            # the success path.
            self._remove_quietly(filepath)
            self._remove_quietly(ocr_output)

        if language:
            # If tesseract gives an error with a language parameter
            # re-run it with no parameter again
            return self.execute(input_filename, language=None)

        raise OCRError(error_text)


# The backend loader returns this imported module and its caller invokes
# ``ocr_backend.execute(filename, language)`` on it, so expose the
# backend's entry point at module level.
execute = Tesseract().execute

View File

@@ -9,12 +9,13 @@ register_settings(
module=u'ocr.conf.settings',
settings=[
{'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True},
{'name': u'TESSERACT_LANGUAGE', 'global_name': u'OCR_TESSERACT_LANGUAGE', 'default': u'eng'},
{'name': u'LANGUAGE', 'global_name': u'OCR_LANGUAGE', 'default': u'eng'},
{'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 0, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')},
{'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')},
{'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': True, 'description': _(u'Automatically queue newly created documents for OCR.')},
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
]
)

View File

@@ -5,9 +5,9 @@ class AlreadyQueued(Exception):
pass
class TesseractError(Exception):
class OCRError(Exception):
"""
Raised by tesseract
Raised by the OCR backend
"""
pass