Removed pdftotext from the requirements, moved unpaper calling to the OCR app

This commit is contained in:
Roberto Rosario
2011-07-18 04:06:19 -04:00
parent ac43e294b3
commit 5bfd607b31
5 changed files with 89 additions and 114 deletions

View File

@@ -5,8 +5,6 @@ import hashlib
from common import TEMPORARY_DIRECTORY
from documents.utils import document_save_to_temp_dir
from converter.conf.settings import UNPAPER_PATH
from converter.conf.settings import OCR_OPTIONS
from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError
from converter.literals import DEFAULT_PAGE_NUMBER, \
@@ -36,21 +34,6 @@ def cleanup(filename):
pass
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    The command line is UNPAPER_PATH, the ``--overwrite`` flag, the
    input file path and the output file path, in that order.

    Raises UnpaperError with the first line unpaper wrote to stderr
    when the program exits with a non-zero return code.
    """
    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting for the child to exit;
    # wait() on a process with a PIPE can deadlock once the child fills
    # the OS pipe buffer.
    stderr_data = proc.communicate()[1]
    if proc.returncode != 0:
        # Keep reporting only the first stderr line (line ending included),
        # as the previous readline() call did.
        error_lines = stderr_data.splitlines(True)
        raise UnpaperError(error_lines[0] if error_lines else stderr_data)
def execute_unoconv(input_filepath, arguments=''):
"""
Executes the program unoconv using subprocess's Popen
@@ -164,38 +147,6 @@ def get_document_dimensions(document, *args, **kwargs):
return [0, 0]
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
# Prepare a single page of a document for OCR and return the path of the
# resulting image file.
#
# document -- object passed to document_save_to_temp_dir(); its `uuid`
#             and `documentpage_set` attributes are used below
# page -- page number to process, also embedded in every temporary file name
# file_format -- extension used for the transformed file and the final output
#
# The three intermediate files are removed by the `finally` clause; the
# returned file is not removed here.
# NOTE(review): input_filepath is not cleaned up in this function either --
# presumably left for the caller or shared with other code; confirm.
#Extract document file
input_filepath = document_save_to_temp_dir(document, document.uuid)
#Convert for OCR
# NOTE(review): `separator` receives the extension part returned by
# os.path.splitext() and is not used again in this function.
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
# All working files live in TEMPORARY_DIRECTORY and are named after the
# extracted file's base name plus a stage marker and the page number.
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
try:
document_page = document.documentpage_set.get(page_number=page)
# `warnings` is not used again in this function.
transformations, warnings = document_page.get_transformation_list()
#Apply default transformations
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file)
#Do OCR operations
# Applies OCR_OPTIONS and writes the pnm file that unpaper reads next.
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
# Process by unpaper
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
# Convert the unpaper output to the requested file_format
backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
finally:
# Remove the intermediate files whether or not a step above failed.
cleanup(transformation_output_file)
cleanup(unpaper_input_file)
cleanup(unpaper_output_file)
return convert_output_file
def get_available_transformations_choices():
result = []
for transformation in backend.get_available_transformations():

View File

@@ -9,12 +9,11 @@ register_settings(
settings=[
{'name': u'IM_CONVERT_PATH', 'global_name': u'CONVERTER_IM_CONVERT_PATH', 'default': u'/usr/bin/convert', 'description': _(u'File path to imagemagick\'s convert program.'), 'exists': True},
{'name': u'IM_IDENTIFY_PATH', 'global_name': u'CONVERTER_IM_IDENTIFY_PATH', 'default': u'/usr/bin/identify', 'description': _(u'File path to imagemagick\'s identify program.'), 'exists': True},
{'name': u'UNPAPER_PATH', 'global_name': u'CONVERTER_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'GM_PATH', 'global_name': u'CONVERTER_GM_PATH', 'default': u'/usr/bin/gm', 'description': _(u'File path to graphicsmagick\'s program.'), 'exists': True},
{'name': u'GM_SETTINGS', 'global_name': u'CONVERTER_GM_SETTINGS', 'default': u''},
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
{'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
#{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
{'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''},
{'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''},
{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},

View File

@@ -9,13 +9,15 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from converter.api import convert_document_for_ocr
from converter.api import convert
from documents.models import DocumentPage
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE
from ocr.conf.settings import PDFTOTEXT_PATH
from ocr.exceptions import TesseractError, PdftotextError
from ocr.exceptions import TesseractError
from ocr.conf.settings import UNPAPER_PATH
from ocr.parsers import parse_document_page
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
def get_language_backend():
@@ -30,7 +32,7 @@ def get_language_backend():
return None
return module
backend = get_language_backend()
language_backend = get_language_backend()
def cleanup(filename):
@@ -58,62 +60,38 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
raise TesseractError(error_text)
def run_pdftotext(input_filename, output_filename, page_number=None):
    """
    Execute the command line binary of pdftotext

    input_filename and output_filename are passed to the program as its
    last two arguments. When page_number is truthy the options
    ``-nopgbrk -f <page_number> -l <page_number>`` are added before
    them; a page_number of None or 0 adds no options.

    Raises PdftotextError with everything the program wrote to stderr
    when it exits with a non-zero return code.
    """
    command = [unicode(PDFTOTEXT_PATH)]
    if page_number:
        page_argument = unicode(page_number)
        command.extend([u'-nopgbrk', u'-f', page_argument, u'-l', page_argument])
    command.extend([unicode(input_filename), unicode(output_filename)])
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # Both stdout and stderr are pipes: communicate() reads them while
    # waiting for the child, whereas wait() can deadlock once the child
    # fills either OS pipe buffer.
    error_text = proc.communicate()[1]
    if proc.returncode != 0:
        raise PdftotextError(error_text)
def do_document_ocr(document):
"""
Do OCR on all the pages of the given document object, first
trying to extract text from PDF using pdftotext then by calling
tesseract
first try to extract text from document pages using the registered
parser if the parser fails or if there is no parser registered for
the document mimetype do a visual OCR by calling tesseract
"""
for document_page in document.documentpage_set.all():
desc, filepath = tempfile.mkstemp()
imagefile = None
source = u''
try:
if document.file_mimetype == u'application/pdf':
pdf_filename = os.extsep.join([filepath, u'pdf'])
document.save_to_file(pdf_filename)
run_pdftotext(pdf_filename, filepath, document_page.page_number)
cleanup(pdf_filename)
if os.stat(filepath).st_size == 0:
#PDF page had no text, run tesseract on the page
imagefile = convert_document_for_ocr(document, page=document_page.page_number)
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, u'txt'])
source = _(u'Text from OCR')
else:
ocr_output = filepath
source = _(u'Text extracted from PDF')
else:
imagefile = convert_document_for_ocr(document, page=document_page.page_number)
run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
ocr_output = os.extsep.join([filepath, u'txt'])
source = _(u'Text from OCR')
f = codecs.open(ocr_output, 'r', 'utf-8')
document_page.content = ocr_cleanup(f.read().strip())
document_page.page_label = source
document_page.save()
f.close()
cleanup(ocr_output)
finally:
os.close(desc)
cleanup(filepath)
if imagefile:
cleanup(imagefile)
# Try to extract text by means of a parser
parse_document_page(document_page)
except (ParserError, ParserUnknownFile):
# Fall back to doing visual OCR
pass
#desc, filepath = tempfile.mkstemp()
#imagefile = None
#source = u''
#imagefile = convert_document_for_ocr(document, page=document_page.page_number)
#run_tesseract(imagefile, filepath, TESSERACT_LANGUAGE)
#ocr_output = os.extsep.join([filepath, u'txt'])
#source = _(u'Text from OCR')
#f = codecs.open(ocr_output, 'r', 'utf-8')
#document_page.content = ocr_cleanup(f.read().strip())
#document_page.page_label = source
#document_page.save()
#f.close()
#cleanup(ocr_output)
#finally:
# pass
#os.close(desc)
#cleanup(filepath)
#if imagefile:
# cleanup(imagefile)
def ocr_cleanup(text):
@@ -126,8 +104,8 @@ def ocr_cleanup(text):
for line in text.splitlines():
line = line.strip()
for word in line.split():
if backend:
result = backend.check_word(word)
if language_backend:
result = language_backend.check_word(word)
else:
result = word
if result:
@@ -146,3 +124,53 @@ def clean_pages():
if page.content:
page.content = ocr_cleanup(page.content)
page.save()
def execute_unpaper(input_filepath, output_filepath):
    """
    Executes the program unpaper using subprocess's Popen

    The command line is UNPAPER_PATH, the ``--overwrite`` flag, the
    input file path and the output file path, in that order.

    Raises UnpaperError with the first line unpaper wrote to stderr
    when the program exits with a non-zero return code.
    """
    # NOTE(review): UnpaperError is not among the imports visible for this
    # module (only TesseractError is imported from ocr.exceptions) --
    # confirm it is in scope, otherwise the raise below fails with NameError.
    command = [UNPAPER_PATH, u'--overwrite', input_filepath, output_filepath]
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
    # communicate() drains stderr while waiting for the child to exit;
    # wait() on a process with a PIPE can deadlock once the child fills
    # the OS pipe buffer.
    stderr_data = proc.communicate()[1]
    if proc.returncode != 0:
        # Keep reporting only the first stderr line (line ending included),
        # as the previous readline() call did.
        error_lines = stderr_data.splitlines(True)
        raise UnpaperError(error_lines[0] if error_lines else stderr_data)
'''
def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
#Extract document file
input_filepath = document_save_to_temp_dir(document, document.uuid)
#Convert for OCR
temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
transformation_output_file = u'%s_trans%s%s%s' % (temp_path, page, os.extsep, file_format)
unpaper_input_file = u'%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
try:
document_page = document.documentpage_set.get(page_number=page)
transformations, warnings = document_page.get_transformation_list()
#Apply default transformations
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, transformations=transformations, output_filepath=transformation_output_file)
#Do OCR operations
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
# Process by unpaper
execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
# Convert to tif
backend.convert_file(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
finally:
cleanup(transformation_output_file)
cleanup(unpaper_input_file)
cleanup(unpaper_output_file)
return convert_output_file
'''

View File

@@ -13,8 +13,9 @@ register_settings(
{'name': u'REPLICATION_DELAY', 'global_name': u'OCR_REPLICATION_DELAY', 'default': 10, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.')},
{'name': u'NODE_CONCURRENT_EXECUTION', 'global_name': u'OCR_NODE_CONCURRENT_EXECUTION', 'default': 1, 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.')},
{'name': u'AUTOMATIC_OCR', 'global_name': u'OCR_AUTOMATIC_OCR', 'default': False, 'description': _(u'Automatically queue newly created documents for OCR.')},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
{'name': u'QUEUE_PROCESSING_INTERVAL', 'global_name': u'OCR_QUEUE_PROCESSING_INTERVAL', 'default': 10},
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')}
{'name': u'CACHE_URI', 'global_name': u'OCR_CACHE_URI', 'default': None, 'description': _(u'URI in the form: "memcached://127.0.0.1:11211/" to specify a cache backend to use for locking. Multiple hosts can be specified separated by a semicolon.')},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PARSERS_PDFTOTEXT_PATH', 'global_name': u'OCR_PARSERS_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'exists': True},
]
)

View File

@@ -4,7 +4,3 @@ class AlreadyQueued(Exception):
class TesseractError(Exception):
    """
    Raised by run_tesseract with the error text of the failed run
    """
class PdftotextError(Exception):
    """
    Raised by run_pdftotext when pdftotext exits with a non-zero
    return code
    """