Files
mayan-edms/mayan/apps/document_parsing/parsers.py
Roberto Rosario 03379ab8ec Fix parsing tests
Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
2019-10-10 01:14:46 -04:00

177 lines
5.6 KiB
Python

from __future__ import unicode_literals
import logging
import os
from shutil import copyfileobj
import subprocess
from django.apps import apps
from django.utils.translation import ugettext_lazy as _
from mayan.apps.storage.utils import NamedTemporaryFile
from .exceptions import ParserError
from .settings import setting_pdftotext_path
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
class Parser(object):
    """
    Parser base class.

    Keeps a registry that maps MIME types to lists of parser classes and
    runs those parsers over document versions and their pages. Subclasses
    must implement execute().
    """
    # MIME type -> list of parser classes, in registration order.
    _registry = {}

    @classmethod
    def parse_document_version_page(cls, document_version_page):
        """
        Parse a single page with the parsers registered for the MIME type
        of the page's document version.

        Parsers are tried in registration order and the first one that
        does not raise ParserError wins. Nothing happens when no parser is
        registered for the MIME type or when every parser fails.
        """
        for parser_class in cls._registry.get(document_version_page.document_version.mimetype, ()):
            try:
                parser = parser_class()
                # The per-page entry point is process_document_version_page();
                # there is no process_document_page() method on this class.
                parser.process_document_version_page(
                    document_version_page=document_version_page
                )
            except ParserError:
                # If parser raises error, try next parser in the list
                pass
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse every page of a document version with the parsers registered
        for its MIME type.

        Parsers are tried in registration order and the first one that
        does not raise ParserError wins. Nothing happens when no parser is
        registered for the MIME type or when every parser fails.
        """
        for parser_class in cls._registry.get(document_version.mimetype, ()):
            try:
                parser = parser_class()
                parser.process_document_version(
                    document_version=document_version
                )
            except ParserError:
                # If parser raises error, try next parser in the list
                pass
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append each class in `parser_classes` to the registry entry of
        each MIME type in `mimetypes`.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    def process_document_version(self, document_version):
        """
        Parse all the pages of `document_version`, one page at a time.

        Raises ParserError as soon as one of the pages fails to parse.
        """
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for document_version_page in document_version.pages.all():
            self.process_document_version_page(
                document_version_page=document_version_page
            )

    def process_document_version_page(self, document_version_page):
        """
        Extract the text of one page with execute() and store it in the
        DocumentVersionPageContent row of that page, creating the row if
        it does not exist.

        Any exception raised while parsing or saving is logged and
        re-raised as ParserError. The intermediate file is always closed.
        """
        # Resolved at call time through the app registry instead of being
        # imported at module level.
        DocumentVersionPageContent = apps.get_model(
            app_label='document_parsing',
            model_name='DocumentVersionPageContent'
        )

        logger.info(
            'Processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

        file_object = document_version_page.document_version.get_intermediate_file()

        try:
            document_version_page_content, created = DocumentVersionPageContent.objects.get_or_create(
                document_version_page=document_version_page
            )
            document_version_page_content.content = self.execute(
                file_object=file_object, page_number=document_version_page.page_number
            )
            document_version_page_content.save()
        except Exception as exception:
            # Normalize every failure to ParserError so that the parse_*
            # class methods can fall through to the next parser.
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text content of page `page_number` of `file_object`.

        Must be overridden by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Read the pdftotext path from the settings.

        Raises ParserError when no file exists at the configured path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Run pdftotext on a single page and return the text it printed.

        The content of `file_object` is copied to a named temporary file
        because pdftotext is given a file name on its command line.
        Returns '' when the output is a lone form feed, otherwise the raw
        output with a trailing newline-newline-form feed sequence removed.
        Raises ParserError when pdftotext exits with a non zero code.
        """
        # NOTE(review): non empty results are returned exactly as read
        # from the subprocess pipe (bytes on Python 3) while the empty
        # case returns ''; confirm callers accept both.
        logger.debug('Parsing PDF page: %d', page_number)

        temporary_file_object = NamedTemporaryFile()
        try:
            copyfileobj(fsrc=file_object, fdst=temporary_file_object)
            # Presumably this also flushes the buffered writes so that
            # pdftotext sees the complete file; confirm.
            temporary_file_object.seek(0)

            # -f and -l are set to the same page number to restrict the
            # conversion to that page; the trailing '-' sends the text to
            # stdout.
            command = [
                self.pdftotext_path,
                '-f', str(page_number),
                '-l', str(page_number),
                temporary_file_object.name,
                '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. Calling
            # wait() first can deadlock once the child fills a pipe
            # buffer.
            output, error_output = proc.communicate()
        finally:
            # Release the temporary file even if the copy or the process
            # launch failed.
            temporary_file_object.close()

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        # A lone form feed means the page produced no text.
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        # Strip the trailing two newlines and form feed.
        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
# Make PopplerParser the parser used for PDF documents.
Parser.register(
    parser_classes=(PopplerParser,), mimetypes=('application/pdf',)
)