Refactor parser system to be class based, add poppler based PDF parser, allow multiple parsers for each mimetype with fallback

This commit is contained in:
Roberto Rosario
2012-05-30 12:57:25 -04:00
parent 036ec09234
commit babd3ec2f3

View File

@@ -1,5 +1,6 @@
import slate
import logging
import tempfile
from django.utils.translation import ugettext as _
@@ -7,6 +8,8 @@ from converter import office_converter
from converter.office_converter import OfficeConverter
from converter.exceptions import OfficeConversionError
from documents.utils import document_save_to_temp_dir
from common.utils import copyfile
from common.conf.settings import TEMPORARY_DIRECTORY
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
@@ -15,48 +18,16 @@ mimetype_registry = {}
logger = logging.getLogger(__name__)
def register_parser(function, mimetype=None, mimetypes=None):
    """
    Associate a parser function with one or several mimetypes.

    When ``mimetypes`` is non-empty every entry in it is registered,
    otherwise the single ``mimetype`` value is used as the registry key.
    Any previous registration for the same key is replaced.
    """
    targets = mimetypes if mimetypes else [mimetype]
    for target in targets:
        mimetype_registry[target] = {'function': function}
def pdf_parser(document_page, descriptor=None):
    """
    Extract the text of one PDF page using slate and save it on the page.

    document_page: page object whose ``page_number`` (1 based) selects the
        PDF page and which receives ``content`` and ``page_label``.
    descriptor: optional open file object holding the PDF; when omitted
        the document version of the page is opened instead.  The
        descriptor is always closed by this function.

    Raises ParserError when the selected page holds only a form feed
    character, that is, no extractable text.
    """
    if not descriptor:
        descriptor = document_page.document_version.open()

    try:
        pdf_pages = slate.PDF(descriptor)
    finally:
        # Release the file handle even when slate fails to read the PDF
        descriptor.close()

    page_text = pdf_pages[document_page.page_number - 1]
    if page_text == '\x0c':
        raise ParserError

    document_page.content = page_text
    document_page.page_label = _(u'Text extracted from PDF')
    document_page.save()
def office_parser(document_page):
    """
    Convert an office document to PDF and extract the page text from it.

    The document is saved to the temporary directory, converted with
    OfficeConverter and the resulting PDF file is handed to pdf_parser.

    Raises ParserError when the conversion fails or produces no output
    file.
    """
    logger.debug('executing')
    try:
        # Local name differs from the imported office_converter module so
        # the module is not shadowed inside this function
        converter = OfficeConverter()
        document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
        logger.debug('document_file: %s', document_file)

        converter.convert(document_file, mimetype=document_page.document.file_mimetype)
        if not converter.exists:
            raise ParserError

        input_filepath = converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)
        # PDF data is binary; pdf_parser closes the descriptor it receives
        pdf_parser(document_page, descriptor=open(input_filepath, 'rb'))
    except OfficeConversionError as msg:
        # Report through the module logger instead of printing to stdout
        logger.error(msg)
        raise ParserError
def register_parser(mimetypes, parsers):
    """
    Instantiate parser classes and register them for the given mimetypes.

    mimetypes: iterable of mimetype strings.
    parsers: iterable of parser classes, in order of preference.  Each
        class is instantiated once per mimetype and the instance is
        appended to the list kept in mimetype_registry for that mimetype.

    A parser whose initialization raises ParserError is left out of the
    list for that mimetype; the failure is logged.
    """
    for mimetype in mimetypes:
        for parser in parsers:
            try:
                parser_instance = parser()
            except ParserError as exception:
                # A parser that fails to initialize is not added to the
                # list for this mimetype
                logger.debug('parser %s not registered for mimetype %s: %s', parser, mimetype, exception)
            else:
                mimetype_registry.setdefault(mimetype, []).append(parser_instance)
def parse_document_page(document_page):
@@ -65,10 +36,129 @@ def parse_document_page(document_page):
logger.debug('mimetype: %s', document_page.document.file_mimetype)

# Only the registry lookup is guarded, so a KeyError raised inside a parser
# is not mistaken for an unknown mimetype
try:
    parsers = mimetype_registry[document_page.document.file_mimetype]
except KeyError:
    raise ParserUnknownFile

# The registry holds a list of parser instances per mimetype, tried in
# registration order
for parser in parsers:
    try:
        parser.parse(document_page)
    except ParserError:
        # If the parser raises an error, try the next parser in the list
        continue
    else:
        # If the parser was successful there is no need to try the
        # others in the list for this mimetype
        break
else:
    # Every registered parser failed: report it to the caller instead of
    # returning as if the page had been parsed
    raise ParserError
# Register the function based parsers: pdf_parser for PDF files and
# office_parser for every mimetype listed in
# office_converter.CONVERTER_OFFICE_FILE_MIMETYPES
register_parser(mimetype=u'application/pdf', function=pdf_parser)
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, function=office_parser)
class Parser(object):
    """
    Parser base class

    Subclasses must override parse() to extract the text of a document
    page.
    """
    def parse(self, document_page, descriptor=None):
        """
        Extract the content of ``document_page``.

        ``descriptor`` is an optional open file object to read from; it is
        part of the base signature so that every parser can be called the
        same way.

        Raises NotImplementedError unless overridden by the subclass.
        """
        raise NotImplementedError("Your %s class has not defined a parse() method, which is required." % self.__class__.__name__)
class SlateParser(Parser):
    """
    Parser for PDF files using the slate library for Python
    """
    def parse(self, document_page, descriptor=None):
        """
        Extract the text of one PDF page and save it on ``document_page``.

        descriptor: optional open file object holding the PDF; when
            omitted the document version of the page is opened instead.
            The descriptor is always closed by this method.

        Raises ParserError when the selected page holds only a form feed
        character, that is, no extractable text.
        """
        if not descriptor:
            descriptor = document_page.document_version.open()

        try:
            pdf_pages = slate.PDF(descriptor)
        finally:
            # Release the file handle even when slate fails to read the PDF
            descriptor.close()

        # page_number is 1 based, the slate page list is 0 based
        page_text = pdf_pages[document_page.page_number - 1]
        if page_text == '\x0c':
            raise ParserError

        document_page.content = page_text
        document_page.page_label = _(u'Text extracted from PDF')
        document_page.save()
class OfficeParser(Parser):
    """
    Parser for office document formats
    """
    def parse(self, document_page):
        """
        Convert the office document to PDF and parse the resulting file.

        The converted PDF is handed to the parsers registered for the
        u'application/pdf' mimetype, in registration order, until one of
        them succeeds.

        Raises ParserError when the conversion fails, produces no output
        file, or no PDF parser is able to extract the page.
        """
        logger.debug('executing')
        try:
            # Local name differs from the imported office_converter module
            # so the module is not shadowed inside this method
            converter = OfficeConverter()
            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
            logger.debug('document_file: %s', document_file)

            converter.convert(document_file, mimetype=document_page.document.file_mimetype)
            if not converter.exists:
                raise ParserError

            input_filepath = converter.output_filepath
            logger.debug('office_converter.output_filepath: %s', input_filepath)

            # Now that the office document has been converted to PDF, call
            # the corresponding PDF parsers on this new file.  The dispatch
            # is done on the PDF mimetype explicitly: the document itself
            # still reports its office mimetype.
            for parser in mimetype_registry.get(u'application/pdf', []):
                try:
                    # A fresh binary descriptor per attempt, since a parser
                    # may consume or close the one it is given
                    with open(input_filepath, 'rb') as descriptor:
                        parser.parse(document_page, descriptor=descriptor)
                except ParserError:
                    continue
                else:
                    return

            raise ParserError
        except OfficeConversionError as msg:
            logger.error(msg)
            raise ParserError
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Locate the pdftotext executable.

        Raises ParserError when the executable does not exist at the
        configured path (or at /usr/bin/pdftotext when none is configured).
        """
        import os

        # NOTE(review): PDFTOTEXT_PATH is not imported in the visible part
        # of this module - confirm it is in scope
        self.pdftotext_path = PDFTOTEXT_PATH or u'/usr/bin/pdftotext'
        if not os.path.exists(self.pdftotext_path):
            raise ParserError('cannot find pdftotext executable')
        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of one PDF page with pdftotext and save it.

        descriptor: optional open file object holding the PDF.  Its
            content is copied to a temporary file, which is removed once
            pdftotext has finished.  When omitted the document of the page
            is saved to the temporary directory instead.

        Raises ParserError when pdftotext exits with a non zero code or
        when the output contains more "other" characters than alphabetic
        ones.
        """
        import os
        import shutil
        import subprocess

        logger.debug('parsing PDF')
        pagenum = str(document_page.page_number)

        temp_filepath = None
        try:
            if descriptor:
                temp_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
                # mkstemp returns an OS level handle; wrap it in a file
                # object so it is written to and closed properly
                with os.fdopen(temp_descriptor, 'wb') as temp_file:
                    shutil.copyfileobj(descriptor, temp_file)
                document_file = temp_filepath
            else:
                document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

            logger.debug('document_file: %s', document_file)
            logger.debug('parsing PDF page %s', pagenum)

            # -f/-l restrict the extraction to a single page, '-' sends the
            # text to standard output
            command = [self.pdftotext_path, '-f', pagenum, '-l', pagenum, document_file, '-']
            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting; calling wait()
            # first can block forever once a pipe buffer fills up
            output, error_output = proc.communicate()
        finally:
            if temp_filepath is not None:
                os.remove(temp_filepath)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        numalpha = sum(1 for character in output if character.isalpha())
        # NOTE(review): notalphaorspace is not defined in the visible part
        # of this module - confirm it is in scope
        numother = sum(1 for character in output if notalphaorspace(character))

        logger.debug("Numalpha = %d Numother = %d" % (numalpha, numother))

        if numother > numalpha:
            logger.debug("parser error... probably scanned pdf.")
            raise ParserError

        document_page.content = output
        document_page.page_label = _(u'Text extracted from PDF')
        document_page.save()
# Parsers are listed in order of preference: for PDF files PopplerParser is
# registered first and SlateParser after it
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
# Every mimetype in office_converter.CONVERTER_OFFICE_FILE_MIMETYPES is
# handled by OfficeParser
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])