Add office document text extractor
This commit is contained in:
@@ -1,20 +1,35 @@
|
||||
import slate
|
||||
import logging
|
||||
|
||||
from django.utils.translation import ugettext as _
|
||||
|
||||
from converter import office_converter
|
||||
from converter import office_converter
|
||||
from converter.office_converter import OfficeConverter
|
||||
from converter.exceptions import OfficeBackendError, OfficeConversionError
|
||||
from documents.utils import document_save_to_temp_dir
|
||||
|
||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
|
||||
mimetype_registry = {}
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def register_parser(mimetype, function):
|
||||
mimetype_registry[mimetype] = {'function': function}
|
||||
def register_parser(function, mimetype=None, mimetypes=None):
    """
    Register a parser function for one or more MIME types.

    function  -- callable stored under the 'function' key of each registry
                 entry; parse_document_page calls it with a document page.
    mimetype  -- a single MIME type to register, or None.
    mimetypes -- an iterable of MIME types to register, or None.

    Both arguments may be supplied together, in which case every MIME type
    named by either one is registered. When neither is supplied nothing is
    registered (no entry is created under the key None).
    """
    # Copy into a new list so the caller's iterable is never mutated.
    targets = list(mimetypes) if mimetypes else []
    if mimetype is not None:
        targets.append(mimetype)

    for target in targets:
        mimetype_registry[target] = {'function': function}
|
||||
|
||||
|
||||
def pdf_parser(document_page):
|
||||
fd = document_page.document.open()
|
||||
pdf_pages = slate.PDF(fd)
|
||||
fd.close()
|
||||
def pdf_parser(document_page, descriptor=None):
|
||||
if not descriptor:
|
||||
descriptor = document_page.document.open()
|
||||
|
||||
pdf_pages = slate.PDF(descriptor)
|
||||
descriptor.close()
|
||||
|
||||
if pdf_pages[document_page.page_number - 1] == '\x0c':
|
||||
raise ParserError
|
||||
@@ -24,11 +39,37 @@ def pdf_parser(document_page):
|
||||
document_page.save()
|
||||
|
||||
|
||||
def office_parser(document_page):
    """
    Extract the text of an office document page.

    The page's document is saved to a temporary file, run through
    OfficeConverter, and the converter's output file is handed to
    pdf_parser together with the page.

    Raises ParserError when the converter reports no output file or when
    the conversion fails with OfficeConversionError.
    """
    logger.debug('executing')
    try:
        # Named 'converter' so the imported office_converter module is not
        # shadowed by a local variable inside this function.
        converter = OfficeConverter()
        document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
        logger.debug('document_file: %s', document_file)

        converter.convert(document_file, mimetype=document_page.document.file_mimetype)
        if not converter.exists:
            raise ParserError

        input_filepath = converter.output_filepath
        logger.debug('office_converter.output_filepath: %s', input_filepath)

        # Open in binary mode and let the with block own the descriptor so
        # it is released even if pdf_parser raises before closing it.
        # pdf_parser closing it as well is harmless.
        with open(input_filepath, 'rb') as descriptor:
            pdf_parser(document_page, descriptor=descriptor)
    except OfficeConversionError as exception:
        # Report through the module logger instead of printing to stdout.
        logger.error('office conversion error: %s', exception)
        raise ParserError
|
||||
|
||||
|
||||
def parse_document_page(document_page):
    """
    Run the parser registered for the MIME type of the page's document.

    document_page -- the page to parse; its document.file_mimetype selects
                     the entry in mimetype_registry.

    Raises ParserUnknownFile when no parser is registered for that MIME
    type. Exceptions raised by the parser itself propagate unchanged.
    """
    logger.debug('executing')
    logger.debug('document_page: %s', document_page)
    logger.debug('mimetype: %s', document_page.document.file_mimetype)

    # Only the registry lookup is guarded: a KeyError raised inside the
    # parser must not be mistaken for an unregistered MIME type.
    try:
        parser = mimetype_registry[document_page.document.file_mimetype]['function']
    except KeyError:
        raise ParserUnknownFile

    parser(document_page)
|
||||
|
||||
|
||||
register_parser('application/pdf', pdf_parser)
|
||||
register_parser(mimetype=u'application/pdf', function=pdf_parser)
|
||||
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, function=office_parser)
|
||||
|
||||
Reference in New Issue
Block a user