"""
Text parsers for the OCR app.

Parsers are registered per mimetype with ``register_parser`` and tried in
registration order by ``parse_document_page`` until one of them succeeds.
"""
import logging
import os
import shutil
import subprocess
import tempfile

import slate

from django.utils.translation import ugettext as _

from converter import office_converter
from converter.office_converter import OfficeConverter
from converter.exceptions import OfficeConversionError
from documents.utils import document_save_to_temp_dir
from common.utils import copyfile
from common.conf.settings import TEMPORARY_DIRECTORY

from ocr.parsers.exceptions import ParserError, ParserUnknownFile

# Maps a mimetype to the list of parser instances able to handle it
mimetype_registry = {}
logger = logging.getLogger(__name__)

# NOTE(review): the visible code reads PDFTOTEXT_PATH without importing or
# defining it anywhere. Presumably it is meant to come from a settings
# module - confirm and wire it up. None selects DEFAULT_PDFTOTEXT_PATH.
PDFTOTEXT_PATH = None
DEFAULT_PDFTOTEXT_PATH = u'/usr/bin/pdftotext'
PDF_MIMETYPE = u'application/pdf'


def notalphaorspace(char):
    """
    Return True when char is neither alphabetic nor whitespace
    """
    return not (char.isalpha() or char.isspace())


def register_parser(mimetypes, parsers):
    """
    Instantiate every parser class in parsers and register the instance
    for each mimetype in mimetypes

    A parser class whose initialization raises ParserError is skipped for
    that mimetype; any other exception propagates to the caller.
    """
    for mimetype in mimetypes:
        for parser in parsers:
            try:
                parser_instance = parser()
            except ParserError:
                # If parser fails initialization is not added to the list
                # for this mimetype
                pass
            else:
                mimetype_registry.setdefault(mimetype, []).append(parser_instance)


def parse_document_page(document_page, descriptor=None, mimetype=None):
    """
    Extract the text of document_page using the parsers registered for
    its mimetype, stopping at the first parser that succeeds

    descriptor is an optional open file-like object to parse instead of
    the stored document file.  mimetype optionally overrides the
    document's own mimetype; this is needed when descriptor holds a
    converted copy of the document (an office file converted to PDF).

    Raises ParserUnknownFile when no parser is registered for the
    mimetype and ParserError when every registered parser fails.
    """
    if mimetype is None:
        mimetype = document_page.document.file_mimetype

    logger.debug('mimetype: %s', mimetype)

    # Only the registry lookup is guarded, so that a KeyError raised
    # inside a parser is not misreported as an unknown file type
    try:
        parsers = mimetype_registry[mimetype]
    except KeyError:
        raise ParserUnknownFile

    for parser in parsers:
        if descriptor is not None:
            # A previous parser may have consumed the stream
            descriptor.seek(0)

        try:
            parser.parse(document_page, descriptor=descriptor)
        except ParserError:
            # If parser raises error, try next parser in the list
            pass
        else:
            # If parser was successful there is no need to try
            # others in the list for this mimetype
            return

    # Tell the caller that no text could be extracted instead of
    # returning as if the page had been parsed
    raise ParserError('Parser list exhausted')


class Parser(object):
    """
    Parser base class
    """

    def parse(self, document_page, descriptor=None):
        """
        Extract the text of document_page and save it on the page

        Subclasses must override this method and raise ParserError when
        no text can be extracted.
        """
        raise NotImplementedError("Your %s class has not defined a parse() method, which is required." % self.__class__.__name__)


class SlateParser(Parser):
    """
    Parser for PDF files using the slate library for Python
    """
    def parse(self, document_page, descriptor=None):
        """
        Store the text slate extracts for the page in document_page

        Raises ParserError when the page is missing or only contains a
        form feed character.
        """
        # Only close the descriptor if it was opened here; a descriptor
        # received from the caller may still be needed by another parser
        opened_here = not descriptor
        if opened_here:
            descriptor = document_page.document_version.open()

        try:
            pdf_pages = slate.PDF(descriptor)
        finally:
            if opened_here:
                descriptor.close()

        try:
            page_text = pdf_pages[document_page.page_number - 1]
        except IndexError:
            raise ParserError

        if page_text == '\x0c':
            raise ParserError

        document_page.content = page_text
        document_page.page_label = _(u'Text extracted from PDF')
        document_page.save()


class OfficeParser(Parser):
    """
    Parser for office document formats
    """
    def parse(self, document_page, descriptor=None):
        """
        Convert the office document using OfficeConverter and hand the
        converted file to the registered PDF parsers

        descriptor is accepted for interface compatibility and is not
        used.  Raises ParserError when the conversion fails or produces
        no output file.
        """
        logger.debug('executing')
        try:
            converter = OfficeConverter()
            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
            logger.debug('document_file: %s', document_file)

            converter.convert(document_file, mimetype=document_page.document.file_mimetype)
        except OfficeConversionError as exception:
            logger.error(exception)
            raise ParserError

        if not converter.exists:
            raise ParserError

        input_filepath = converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # Now that the office document has been converted to PDF call the
        # corresponding PDF parsers on this new file.  The mimetype must
        # be overridden, otherwise the lookup would use the document's
        # office mimetype and call this parser again.
        with open(input_filepath, 'rb') as converted_file:
            parse_document_page(document_page, descriptor=converted_file, mimetype=PDF_MIMETYPE)


class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Raises ParserError when the pdftotext executable cannot be found
        """
        self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else DEFAULT_PDFTOTEXT_PATH
        if not os.path.exists(self.pdftotext_path):
            raise ParserError('cannot find pdftotext executable')
        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def parse(self, document_page, descriptor=None):
        """
        Run pdftotext on the single page of document_page and store its
        output as the page content

        Raises ParserError when pdftotext cannot be run, exits with a non
        zero code, or when its output holds more symbols than letters.
        """
        logger.debug('parsing PDF')
        pagenum = str(document_page.page_number)
        temp_filepath = None

        try:
            if descriptor:
                # pdftotext needs a file path, copy the stream to a
                # temporary file.  mkstemp returns an OS level handle,
                # which is wrapped so that it gets written and closed.
                file_handle, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
                with os.fdopen(file_handle, 'wb') as destination:
                    shutil.copyfileobj(descriptor, destination)
                document_file = temp_filepath
            else:
                document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

            logger.debug('document_file: %s', document_file)
            logger.debug('parsing PDF page %s', pagenum)

            command = [
                self.pdftotext_path, '-f', pagenum, '-l', pagenum,
                document_file, '-',
            ]

            try:
                proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            except OSError as exception:
                logger.error(exception)
                raise ParserError

            # communicate() drains both pipes; calling wait() first can
            # block forever once the output fills the pipe buffer
            output, errors = proc.communicate()
        finally:
            # Only the temporary copy created above is removed
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError:
                    pass

        if proc.returncode != 0:
            logger.error(errors.strip())
            raise ParserError

        # NOTE(review): assumes pdftotext emits UTF-8 - confirm
        text = output.decode('utf-8', 'replace')
        numalpha = sum(1 for char in text if char.isalpha())
        numother = sum(1 for char in text if notalphaorspace(char))

        logger.debug("Numalpha = %d Numother = %d", numalpha, numother)

        if numother > numalpha:
            logger.debug("parser error... probably scanned pdf.")
            raise ParserError

        document_page.content = text
        document_page.page_label = _(u'Text extracted from PDF')
        document_page.save()


register_parser(mimetypes=[PDF_MIMETYPE], parsers=[PopplerParser, SlateParser])
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])