Instead of inserting the path of the apps into the Python path, the apps are now referenced by their full import path. This solves name clashes with external or native Python libraries. Example: the Mayan statistics app vs. Python's new statistics library. Every app reference is now prepended with 'mayan.apps'. Existing config.yml files need to be updated manually. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
166 lines
5.2 KiB
Python
166 lines
5.2 KiB
Python
from __future__ import unicode_literals
|
|
|
|
import logging
|
|
import os
|
|
import subprocess
|
|
|
|
from django.apps import apps
|
|
from django.utils.translation import ugettext_lazy as _
|
|
|
|
from mayan.apps.common.utils import copyfile, fs_cleanup, mkstemp
|
|
|
|
from .exceptions import ParserError
|
|
from .settings import setting_pdftotext_path
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class Parser(object):
    """
    Base class for document content parsers.

    Holds a class-level registry that maps a MIME type to the ordered list
    of parser classes registered for it. Subclasses provide the actual text
    extraction by overriding execute().
    """
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Add every class in parser_classes to the registry entry of every
        MIME type in mimetypes, keeping the order in which they are given.
        """
        pairs = (
            (mimetype, parser_class)
            for mimetype in mimetypes
            for parser_class in parser_classes
        )
        for mimetype, parser_class in pairs:
            cls._registry.setdefault(mimetype, []).append(parser_class)

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Run the parsers registered for the MIME type of document_version,
        in registration order, stopping at the first one that completes
        without raising ParserError. Does nothing when no parser is
        registered for that MIME type.
        """
        candidates = cls._registry.get(document_version.mimetype, ())
        for parser_class in candidates:
            try:
                parser_class().process_document_version(document_version)
            except ParserError:
                # This parser gave up; let the next registered one try
                continue
            # A parser finished cleanly, the remaining ones are not needed
            return

    @classmethod
    def parse_document_page(cls, document_page):
        """
        Same as parse_document_version() but for a single page. The MIME
        type is taken from the document version the page belongs to.
        """
        candidates = cls._registry.get(
            document_page.document_version.mimetype, ()
        )
        for parser_class in candidates:
            try:
                parser_class().process_document_page(document_page)
            except ParserError:
                # This parser gave up; let the next registered one try
                continue
            # A parser finished cleanly, the remaining ones are not needed
            return

    def process_document_version(self, document_version):
        """
        Parse each page of document_version by calling
        process_document_page() on it.
        """
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for page in document_version.pages.all():
            self.process_document_page(document_page=page)

    def process_document_page(self, document_page):
        """
        Extract the content of document_page with execute() and save it in
        the DocumentPageContent instance of that page, creating the
        instance when it does not exist yet.

        Any exception raised while fetching, extracting or saving is logged
        and re-raised as ParserError. The intermediate file is always
        closed.
        """
        # Resolved through the app registry instead of a direct import
        DocumentPageContent = apps.get_model(
            app_label='document_parsing', model_name='DocumentPageContent'
        )

        version = document_page.document_version
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            page_content, _created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            page_content.content = self.execute(
                file_object=file_object,
                page_number=document_page.page_number
            )
            page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the content of page page_number of file_object. Must be
        overridden by subclasses; the base implementation always raises
        NotImplementedError.
        """
        message = (
            'Your %s class has not defined the required execute() method.'
            % type(self).__name__
        )
        raise NotImplementedError(message)
|
|
|
|
|
|
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Read the pdftotext path from the settings.

        Raises ParserError when no file exists at the configured path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Run pdftotext on a temporary copy of file_object, restricted to
        page page_number, and return what it wrote to its standard output.

        Returns an empty string when the output is a lone form feed, and
        strips a trailing newline-newline-form feed sequence otherwise.
        Raises ParserError when pdftotext exits with a non zero return
        code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()
        try:
            copyfile(file_object, temp_filepath)

            # -f and -l select the first and last page to convert, the
            # final '-' sends the text to stdout
            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. Calling
            # wait() first can block forever once the child fills a pipe
            # buffer.
            output, error_output = proc.communicate()
        finally:
            # Remove the temporary copy even if copying or launching the
            # process failed
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError(
                _('pdftotext exited with return code: %d') % proc.returncode
            )

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
|
|
|
|
|
|
# Register PopplerParser as the parser class for PDF documents
Parser.register(
    parser_classes=(PopplerParser,), mimetypes=('application/pdf',)
)
|