Finish the document parsing app.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -1,20 +1,15 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import os
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
import subprocess
|
||||
|
||||
from django.apps import apps
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import copyfile, fs_cleanup, mkstemp
|
||||
|
||||
from .exceptions import ParserError, NoMIMETypeMatch
|
||||
from .models import DocumentPageContent
|
||||
from .settings import setting_pdftotext_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -82,6 +77,10 @@ class Parser(object):
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page):
|
||||
DocumentPageContent = apps.get_model(
|
||||
app_label='document_parsing', model_name='DocumentPageContent'
|
||||
)
|
||||
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
@@ -171,32 +170,7 @@ class PopplerParser(Parser):
|
||||
return output
|
||||
|
||||
|
||||
class PDFMinerParser(Parser):
|
||||
"""
|
||||
Parser for PDF files using the PDFMiner library for Python
|
||||
"""
|
||||
|
||||
def execute(self, file_object, page_number):
|
||||
logger.debug('Parsing PDF page: %d', page_number)
|
||||
|
||||
with BytesIO() as string_buffer:
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = TextConverter(
|
||||
rsrcmgr, outfp=string_buffer, laparams=LAParams()
|
||||
)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
page = PDFPage.get_pages(
|
||||
file_object, maxpages=1, pagenos=(page_number - 1,)
|
||||
)
|
||||
interpreter.process_page(page.next())
|
||||
device.close()
|
||||
|
||||
logger.debug('Finished parsing PDF: %d', page_number)
|
||||
|
||||
return string_buffer.getvalue()
|
||||
|
||||
|
||||
Parser.register(
|
||||
mimetypes=('application/pdf',),
|
||||
parser_classes=(PopplerParser, PDFMinerParser)
|
||||
parser_classes=(PopplerParser,)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user