From 5bfd607b31d72a9d8db59111d042f722fe354208 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Mon, 18 Jul 2011 04:06:19 -0400 Subject: [PATCH] Removed pdftotext from the requirements, move unpaper calling to the OCR app --- apps/converter/api.py | 49 ----------- apps/converter/conf/settings.py | 3 +- apps/ocr/api.py | 142 +++++++++++++++++++------------- apps/ocr/conf/settings.py | 5 +- apps/ocr/exceptions.py | 4 - 5 files changed, 89 insertions(+), 114 deletions(-) diff --git a/apps/converter/api.py b/apps/converter/api.py index 71a188a36d..665a980c27 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -5,8 +5,6 @@ import hashlib from common import TEMPORARY_DIRECTORY from documents.utils import document_save_to_temp_dir -from converter.conf.settings import UNPAPER_PATH -from converter.conf.settings import OCR_OPTIONS from converter.conf.settings import UNOCONV_PATH from converter.exceptions import UnpaperError, OfficeConversionError from converter.literals import DEFAULT_PAGE_NUMBER, \ @@ -36,21 +34,6 @@ def cleanup(filename): pass -def execute_unpaper(input_filepath, output_filepath): - """ - Executes the program unpaper using subprocess's Popen - """ - command = [] - command.append(UNPAPER_PATH) - command.append(u'--overwrite') - command.append(input_filepath) - command.append(output_filepath) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - raise UnpaperError(proc.stderr.readline()) - - def execute_unoconv(input_filepath, arguments=''): """ Executes the program unoconv using subprocess's Popen @@ -164,38 +147,6 @@ def get_document_dimensions(document, *args, **kwargs): return [0, 0] -def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): - #Extract document file - input_filepath = document_save_to_temp_dir(document, document.uuid) - - #Convert for OCR - temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) - temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) - transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format) - unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) - unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) - convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) - - try: - document_page = document.documentpage_set.get(page_number=page) - transformations, warnings = document_page.get_transformation_list() - - #Apply default transformations - backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file) - #Do OCR operations - backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) - # Process by unpaper - execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) - # Convert to tif - backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file) - finally: - cleanup(transformation_output_file) - cleanup(unpaper_input_file) - cleanup(unpaper_output_file) - - return convert_output_file - - def get_available_transformations_choices(): result = [] for transformation in backend.get_available_transformations(): diff --git a/apps/converter/conf/settings.py b/apps/converter/conf/settings.py index fcaa1ec9b0..95aee33b92 100644 --- a/apps/converter/conf/settings.py +++ b/apps/converter/conf/settings.py @@ -9,12 +9,11 @@ register_settings( settings=[ {'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to imagemagick\'s convert program.'), 'exists': True}, {'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True}, - {'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, {'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True}, {'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''}, {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')}, {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True}, - {'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, + #{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, {'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''}, {'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''}, {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'}, diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 88e9c20356..ec89a669c9 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -9,13 +9,15 @@ import sys from django.utils.translation import ugettext as _ from django.utils.importlib import import_module -from converter.api import convert_document_for_ocr +from converter.api import convert from documents.models import DocumentPage from ocr.conf.settings import TESSERACT_PATH from ocr.conf.settings import TESSERACT_LANGUAGE -from ocr.conf.settings import PDFTOTEXT_PATH -from ocr.exceptions import TesseractError, PdftotextError +from ocr.exceptions import TesseractError +from ocr.conf.settings import UNPAPER_PATH +from ocr.parsers import parse_document_page +from ocr.parsers.exceptions import ParserError, ParserUnknownFile def get_language_backend(): @@ -30,7 +32,7 @@ def get_language_backend(): return None return module -backend = get_language_backend() +language_backend = get_language_backend() def cleanup(filename): @@ -58,62 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None): raise TesseractError(error_text) -def run_pdftotext(input_filename, output_filename, page_number=None): - """ - Execute the command line binary of pdftotext - """ - command = [unicode(PDFTOTEXT_PATH)] - if page_number: - command.extend([u'-nopgbrk', u'-f', unicode(page_number), u'-l', unicode(page_number)]) - command.extend([unicode(input_filename), unicode(output_filename)]) - proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - return_code = proc.wait() - if return_code != 0: - error_text = proc.stderr.read() - raise PdftotextError(error_text) - - def do_document_ocr(document): """ - Do OCR on all the pages of the given document object, first - trying to extract text from PDF using pdftotext then by calling - tesseract + first try to extract text from document pages using the registered + parser if the parser fails or if there is no parser registered for + the document mimetype do a visual OCR by calling tesseract """ for document_page in document.documentpage_set.all(): - desc, filepath = tempfile.mkstemp() - imagefile = None - source = u'' try: - if document.file_mimetype == u'application/pdf': - pdf_filename = os.extsep.join([filepath, u'pdf']) - document.save_to_file(pdf_filename) - run_pdftotext(pdf_filename, filepath, document_page.page_number) - cleanup(pdf_filename) - if os.stat(filepath).st_size == 0: - #PDF page had no text, run tesseract on the page - imagefile = convert_document_for_ocr(document, page=document_page.page_number) - run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) - ocr_output = os.extsep.join([filepath, u'txt']) - source = _(u'Text from OCR') - else: - ocr_output = filepath - source = _(u'Text extracted from PDF') - else: - imagefile = convert_document_for_ocr(document, page=document_page.page_number) - run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) - ocr_output = os.extsep.join([filepath, u'txt']) - source = _(u'Text from OCR') - f = codecs.open(ocr_output, 'r', 'utf-8') - document_page.content = ocr_cleanup(f.read().strip()) - document_page.page_label = source - document_page.save() - f.close() - cleanup(ocr_output) - finally: - os.close(desc) - cleanup(filepath) - if imagefile: - cleanup(imagefile) + # Try to extract text by means of a parser + parse_document_page(document_page) + except (ParserError, ParserUnknownFile): + # Fall back to doing visual OCR + pass + #desc, filepath = tempfile.mkstemp() + #imagefile = None + #source = u'' + #imagefile = convert_document_for_ocr(document, page=document_page.page_number) + #run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE) + #ocr_output = os.extsep.join([filepath, u'txt']) + #source = _(u'Text from OCR') + #f = codecs.open(ocr_output, 'r', 'utf-8') + #document_page.content = ocr_cleanup(f.read().strip()) + #document_page.page_label = source + #document_page.save() + #f.close() + #cleanup(ocr_output) + #finally: + # pass + #os.close(desc) + #cleanup(filepath) + #if imagefile: + # cleanup(imagefile) def ocr_cleanup(text): @@ -126,8 +104,8 @@ def ocr_cleanup(text): for line in text.splitlines(): line = line.strip() for word in line.split(): - if backend: - result = backend.check_word(word) + if language_backend: + result = language_backend.check_word(word) else: result = word if result: @@ -146,3 +124,53 @@ def clean_pages(): if page.content: page.content = ocr_cleanup(page.content) page.save() + + +def execute_unpaper(input_filepath, output_filepath): + """ + Executes the program unpaper using subprocess's Popen + """ + command = [] + command.append(UNPAPER_PATH) + command.append(u'--overwrite') + command.append(input_filepath) + command.append(output_filepath) + proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) + return_code = proc.wait() + if return_code != 0: + raise UnpaperError(proc.stderr.readline()) + +''' +def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT): + #Extract document file + input_filepath = document_save_to_temp_dir(document, document.uuid) + + #Convert for OCR + temp_filename, separator = os.path.splitext(os.path.basename(input_filepath)) + temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename) + transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format) + unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep) + unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep) + convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format) + + try: + document_page = document.documentpage_set.get(page_number=page) + transformations, warnings = document_page.get_transformation_list() + + #Apply default transformations + backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file) + #Do OCR operations + backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file) + # Process by unpaper + execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file) + # Convert to tif + backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file) + finally: + cleanup(transformation_output_file) + cleanup(unpaper_input_file) + cleanup(unpaper_output_file) + + return convert_output_file +''' + + diff --git a/apps/ocr/conf/settings.py b/apps/ocr/conf/settings.py index e9024b7152..52785f46ac 100644 --- a/apps/ocr/conf/settings.py +++ b/apps/ocr/conf/settings.py @@ -13,8 +13,9 @@ register_settings( {'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 10, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')}, {'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')}, {'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')}, - {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True}, {'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10}, - {'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')} + {'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')}, + {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, + {'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True}, ] ) diff --git a/apps/ocr/exceptions.py b/apps/ocr/exceptions.py index 4bfa8f725a..b1ec8c3fe3 100644 --- a/apps/ocr/exceptions.py +++ b/apps/ocr/exceptions.py @@ -4,7 +4,3 @@ class AlreadyQueued(Exception): class TesseractError(Exception): pass - - -class PdftotextError(Exception): - pass