Merge branch 'raw_ocr_fallback' into smart_staging

This commit is contained in:
Roberto Rosario
2011-07-19 04:22:51 -04:00
7 changed files with 64 additions and 72 deletions

View File

@@ -8,7 +8,7 @@ from documents.utils import document_save_to_temp_dir
from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError
from converter.literals import DEFAULT_PAGE_NUMBER, \
DEFAULT_OCR_FILE_FORMAT, QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
from converter import backend
@@ -100,12 +100,13 @@ def convert(input_filepath, cleanup_files=True, *args, **kwargs):
unoconv_output = result
input_filepath = result
transformations.append(
{
'transformation': TRANSFORMATION_RESIZE,
'arguments': dict(zip([u'width', u'height'], size.split(DIMENSION_SEPARATOR)))
}
)
if size:
transformations.append(
{
'transformation': TRANSFORMATION_RESIZE,
'arguments': dict(zip([u'width', u'height'], size.split(DIMENSION_SEPARATOR)))
}
)
if zoom != 100:
transformations.append(

View File

@@ -9,7 +9,6 @@ DEFAULT_ZOOM_LEVEL = 100
DEFAULT_ROTATION = 0
DEFAULT_PAGE_NUMBER = 1
DEFAULT_FILE_FORMAT = u'jpeg'
DEFAULT_OCR_FILE_FORMAT = u'tif'
QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = u'quality_low'

View File

@@ -9,8 +9,10 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from common import TEMPORARY_DIRECTORY
from converter.api import convert
from documents.models import DocumentPage
from documents.utils import document_save_to_temp_dir
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE
@@ -18,6 +20,7 @@ from ocr.exceptions import TesseractError
from ocr.conf.settings import UNPAPER_PATH
from ocr.parsers import parse_document_page
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT
def get_language_backend():
@@ -45,11 +48,14 @@ def cleanup(filename):
pass
def run_tesseract(input_filename, output_filename_base, lang=None):
def run_tesseract(input_filename, lang=None):
"""
Execute the command line binary of tesseract
"""
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(output_filename_base)]
fd, filepath = tempfile.mkstemp()
os.close(fd)
ocr_output = os.extsep.join([filepath, u'txt'])
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
if lang is not None:
command += [u'-l', lang]
@@ -57,41 +63,57 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
return_code = proc.wait()
if return_code != 0:
error_text = proc.stderr.read()
cleanup(filepath)
cleanup(ocr_output)
raise TesseractError(error_text)
return codecs.open(ocr_output, 'r', 'utf-8'), ocr_output
def do_document_ocr(queue_document):
    """
    Try first to extract text from each page of the queued document using
    the registered parser; if the parser fails, or if there is no parser
    registered for the document mimetype, do a visual OCR by calling
    tesseract.

    queue_document: OCR queue entry; the pages of its ``document`` are
    processed and its own transformation list is applied before OCR.

    Errors raised by the conversion, unpaper or tesseract steps are not
    caught here and propagate to the caller.
    """
    for document_page in queue_document.document.documentpage_set.all():
        try:
            # Try to extract text by means of a parser
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
            _ocr_document_page(queue_document, document_page)


def _ocr_document_page(queue_document, document_page):
    """
    Run the visual OCR pipeline (convert -> unpaper -> convert -> tesseract)
    for a single page and store the recognized text on ``document_page``.

    Every temporary file produced along the way is removed when the
    function exits, whether it succeeds or raises.
    """
    # The previous code referenced an undefined name ``document`` here;
    # the document being processed is the one that owns the page.
    document = document_page.document

    # Page transformations are applied first, then the OCR queue ones.
    # The second element of each result is unpacked but was never used.
    document_transformations, _page_warnings = document_page.get_transformation_list()
    ocr_transformations, _queue_warnings = queue_document.get_transformation_list()
    transformations = []
    transformations.extend(document_transformations)
    transformations.extend(ocr_transformations)

    unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (
        document.uuid, document_page.page_number, os.extsep,
        UNPAPER_FILE_FORMAT
    )
    document_filepath = os.path.join(TEMPORARY_DIRECTORY, document.uuid)
    unpaper_output_filepath = os.path.join(
        TEMPORARY_DIRECTORY, unpaper_output_filename
    )

    # Paths are registered as soon as they are known so the ``finally``
    # clause removes them even when a later step fails.
    # NOTE(review): relies on cleanup() ignoring paths that do not exist,
    # as run_tesseract() already appears to assume -- confirm.
    temporary_files = [document_filepath, unpaper_output_filepath]
    try:
        document.save_to_file(document_filepath)

        transformed_filepath = convert(
            document_filepath, file_format=UNPAPER_FILE_FORMAT,
            page=document_page.page_number, transformations=transformations
        )
        temporary_files.append(transformed_filepath)

        execute_unpaper(
            input_filepath=transformed_filepath,
            output_filepath=unpaper_output_filepath
        )

        # Convert to TIFF
        pre_ocr_filepath = convert(
            input_filepath=unpaper_output_filepath,
            file_format=DEFAULT_OCR_FILE_FORMAT
        )
        temporary_files.append(pre_ocr_filepath)

        # Tesseract needs an explicit file extension
        pre_ocr_filepath_w_ext = os.extsep.join(
            [pre_ocr_filepath, DEFAULT_OCR_FILE_FORMAT]
        )
        os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
        temporary_files.append(pre_ocr_filepath_w_ext)

        fd, ocr_output = run_tesseract(
            pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE
        )
        temporary_files.append(ocr_output)
        try:
            document_page.content = ocr_cleanup(fd.read().strip())
        finally:
            # Close the tesseract output even if reading or cleanup fails
            fd.close()

        document_page.page_label = _(u'Text from OCR')
        document_page.save()
    finally:
        for temporary_file in temporary_files:
            cleanup(temporary_file)
def ocr_cleanup(text):
@@ -139,38 +161,3 @@ def execute_unpaper(input_filepath, output_filepath):
return_code = proc.wait()
if return_code != 0:
raise UnpaperError(proc.stderr.readline())
'''
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
#Extract document file
input_filepath = document_save_to_temp_dir(document, document.uuid)
#Convert for OCR
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
try:
document_page = document.documentpage_set.get(page_number=page)
transformations, warnings = document_page.get_transformation_list()
#Apply default transformations
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file)
#Do OCR operations
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
# Process by unpaper
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
# Convert to tif
backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
finally:
cleanup(transformation_output_file)
cleanup(unpaper_input_file)
cleanup(unpaper_output_file)
return convert_output_file
'''

View File

@@ -16,6 +16,5 @@ register_settings(
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
]
)

View File

@@ -19,3 +19,6 @@ QUEUEDOCUMENT_STATE_CHOICES = (
(QUEUEDOCUMENT_STATE_PROCESSING, _(u'processing')),
(QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
)
# Image format the OCR code converts a page to before calling tesseract;
# it is also appended to the file name as the explicit extension.
DEFAULT_OCR_FILE_FORMAT = u'tif'
# Image format used for the files passed to and produced by unpaper.
UNPAPER_FILE_FORMAT = u'pnm'

View File

@@ -48,6 +48,9 @@ class QueueDocument(models.Model):
ordering = ('datetime_submitted',)
verbose_name = _(u'queue document')
verbose_name_plural = _(u'queue documents')
def get_transformation_list(self):
    """
    Return what the QueueTransformation manager's
    ``get_for_object_as_list`` yields for this queue document.
    """
    manager = QueueTransformation.objects
    return manager.get_for_object_as_list(self)
def __unicode__(self):
try:

View File

@@ -56,7 +56,7 @@ def task_process_queue_document(queue_document_id):
queue_document.result = task_process_queue_document.request.id
queue_document.save()
try:
do_document_ocr(queue_document.document)
do_document_ocr(queue_document)
queue_document.delete()
except Exception, e:
queue_document.state = QUEUEDOCUMENT_STATE_ERROR