From 922971274fc3c79aef400d8b9e4f0c333dc648f4 Mon Sep 17 00:00:00 2001
From: Roberto Rosario
Date: Thu, 1 Dec 2011 04:54:14 -0400
Subject: [PATCH] Add office document text extractor

---
 apps/ocr/parsers/__init__.py | 83 ++++++++++++++++++++++++++++++++----
 1 file changed, 76 insertions(+), 7 deletions(-)

diff --git a/apps/ocr/parsers/__init__.py b/apps/ocr/parsers/__init__.py
index 8ed4be7cb7..8fac71c084 100644
--- a/apps/ocr/parsers/__init__.py
+++ b/apps/ocr/parsers/__init__.py
@@ -1,20 +1,51 @@
 import slate
+import logging
 
 from django.utils.translation import ugettext as _
 
+from converter import office_converter
+from converter.office_converter import OfficeConverter
+from converter.exceptions import OfficeBackendError, OfficeConversionError
+from documents.utils import document_save_to_temp_dir
+
 from ocr.parsers.exceptions import ParserError, ParserUnknownFile
 
+
 mimetype_registry = {}
+logger = logging.getLogger(__name__)
 
 
-def register_parser(mimetype, function):
-    mimetype_registry[mimetype] = {'function': function}
+def register_parser(function, mimetype=None, mimetypes=None):
+    """
+    Register ``function`` as the parser for one or more MIME types.
+
+    Pass either a single ``mimetype`` or an iterable of ``mimetypes``; when
+    both are given only ``mimetypes`` is used.
+    """
+    if not mimetypes:
+        # Never register under None: a document whose MIME type is None
+        # must keep raising ParserUnknownFile instead of reaching a parser.
+        mimetypes = [] if mimetype is None else [mimetype]
+
+    for entry in mimetypes:
+        mimetype_registry[entry] = {'function': function}
 
 
-def pdf_parser(document_page):
-    fd = document_page.document.open()
-    pdf_pages = slate.PDF(fd)
-    fd.close()
+def pdf_parser(document_page, descriptor=None):
+    """
+    Parse PDF data with slate on behalf of ``document_page``.
+
+    ``descriptor`` is an optional, already opened file-like object holding
+    the PDF; when omitted the page's own document is opened.  Either way
+    the descriptor is closed once slate has read it, even on failure.
+    """
+    if descriptor is None:
+        descriptor = document_page.document.open()
+
+    try:
+        pdf_pages = slate.PDF(descriptor)
+    finally:
+        descriptor.close()
 
     if pdf_pages[document_page.page_number - 1] == '\x0c':
         raise ParserError
@@ -24,11 +55,49 @@ def pdf_parser(document_page):
     document_page.save()
 
 
+def office_parser(document_page):
+    """
+    Parse an office document page by converting the document first.
+
+    The document is stored with document_save_to_temp_dir and run through
+    OfficeConverter; the converter output is then passed to pdf_parser.
+    Raises ParserError when the conversion fails or produces no output.
+    """
+    logger.debug('executing')
+    try:
+        converter = OfficeConverter()
+        document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
+        logger.debug('document_file: %s', document_file)
+
+        converter.convert(document_file, mimetype=document_page.document.file_mimetype)
+        if converter.exists:
+            input_filepath = converter.output_filepath
+            logger.debug('office_converter.output_filepath: %s', input_filepath)
+
+            # Binary mode, the output goes to a PDF parser; pdf_parser closes the descriptor.
+            pdf_parser(document_page, descriptor=open(input_filepath, 'rb'))
+        else:
+            raise ParserError
+
+    except OfficeConversionError, msg:
+        logger.error('office conversion failed: %s', msg)
+        raise ParserError
+
+
 def parse_document_page(document_page):
+    """
+    Run the parser registered for the MIME type of the page's document.
+    Raises ParserUnknownFile on KeyError (e.g. no parser is registered).
+    """
+    logger.debug('executing')
+    logger.debug('document_page: %s', document_page)
+    logger.debug('mimetype: %s', document_page.document.file_mimetype)
+
     try:
         mimetype_registry[document_page.document.file_mimetype]['function'](document_page)
     except KeyError:
         raise ParserUnknownFile
 
 
-register_parser('application/pdf', pdf_parser)
+register_parser(mimetype=u'application/pdf', function=pdf_parser)
+register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, function=office_parser)