Finish the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-24 03:54:07 -04:00
parent e9591c92f9
commit a7eaf6b368
25 changed files with 423 additions and 639 deletions

View File

@@ -1,20 +1,15 @@
from __future__ import unicode_literals
from io import BytesIO
import logging
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import subprocess
from django.apps import apps
from django.utils.translation import ugettext_lazy as _
from common.utils import copyfile, fs_cleanup, mkstemp
from .exceptions import ParserError, NoMIMETypeMatch
from .models import DocumentPageContent
from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__)
@@ -82,6 +77,10 @@ class Parser(object):
self.process_document_page(document_page=document_page)
def process_document_page(self, document_page):
DocumentPageContent = apps.get_model(
app_label='document_parsing', model_name='DocumentPageContent'
)
logger.info(
'Processing page: %d of document version: %s',
document_page.page_number, document_page.document_version
@@ -171,32 +170,7 @@ class PopplerParser(Parser):
return output
class PDFMinerParser(Parser):
"""
Parser for PDF files using the PDFMiner library for Python
"""
def execute(self, file_object, page_number):
logger.debug('Parsing PDF page: %d', page_number)
with BytesIO() as string_buffer:
rsrcmgr = PDFResourceManager()
device = TextConverter(
rsrcmgr, outfp=string_buffer, laparams=LAParams()
)
interpreter = PDFPageInterpreter(rsrcmgr, device)
page = PDFPage.get_pages(
file_object, maxpages=1, pagenos=(page_number - 1,)
)
interpreter.process_page(page.next())
device.close()
logger.debug('Finished parsing PDF: %d', page_number)
return string_buffer.getvalue()
Parser.register(
mimetypes=('application/pdf',),
parser_classes=(PopplerParser, PDFMinerParser)
parser_classes=(PopplerParser,)
)