Files
mayan-edms/mayan/apps/document_parsing/parsers.py
Roberto Rosario 0699ad0556 Add support for new document page structure
Documents now have their own dedicated DocumentPage
submodel. The old DocumentPage is now called DocumentVersionPage.
This allows mappings between document pages and document version
pages, allowing renumbering, appending pages.
DocumentPages have a content_object to map them to any other
object. For now they only map to DocumentVersionPages.
New option added to the version upload form to append the
pages of the new version.
A new view was added to just append new pages, which wraps the
new document version upload form and hides the append pages
checkbox, which is set to True.
Add a new action, reset_pages, to reset the pages of the
document to those of the latest version.

Missing: appending tests, checks for proper content_object in OCR and
document parsing.

Author: Roberto Rosario <roberto.rosario@mayan-edms.com>
Date:   Thu Oct 11 12:00:25 2019 -0400
2019-10-10 11:55:42 -04:00

177 lines
5.6 KiB
Python

from __future__ import unicode_literals
import logging
import os
from shutil import copyfileobj
import subprocess
from django.apps import apps
from django.utils.translation import ugettext_lazy as _
from mayan.apps.storage.utils import NamedTemporaryFile
from .exceptions import ParserError
from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__)
class Parser(object):
    """
    Parser base class

    Keeps a class level registry that maps MIME types to the parser
    classes registered for them. Subclasses must implement execute() to
    return the text content of a single page.
    """
    # Maps a MIME type to the list of parser classes registered for it.
    # Defined on the base class, so it is shared by all subclasses.
    _registry = {}

    @classmethod
    def parse_document_version_page(cls, document_version_page):
        """
        Parse a single document version page.

        The parsers registered for the MIME type of the page's document
        version are tried in registration order until one finishes
        without raising ParserError. Nothing is done when no parser is
        registered for the MIME type.
        """
        mimetype = document_version_page.document_version.mimetype

        for parser_class in cls._registry.get(mimetype, ()):
            try:
                parser = parser_class()
                # The per page entry point is
                # process_document_version_page(); this class has no
                # process_document_page() method.
                parser.process_document_version_page(
                    document_version_page=document_version_page
                )
            except ParserError:
                # If parser raises error, try next parser in the list
                pass
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse all the pages of a document version.

        The parsers registered for the MIME type of the document version
        are tried in registration order until one finishes without
        raising ParserError. Nothing is done when no parser is
        registered for the MIME type.
        """
        for parser_class in cls._registry.get(document_version.mimetype, ()):
            try:
                parser = parser_class()
                parser.process_document_version(
                    document_version=document_version
                )
            except ParserError:
                # If parser raises error, try next parser in the list
                pass
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Register every class in parser_classes as a parser for every
        MIME type in mimetypes. Classes are appended, so the order of
        registration is the order in which parsers are tried.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    def process_document_version(self, document_version):
        """
        Process, one by one, every page of the document version.

        A ParserError raised while processing a page is not handled here
        and stops the processing of the remaining pages.
        """
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for document_version_page in document_version.pages.all():
            self.process_document_version_page(
                document_version_page=document_version_page
            )

    def process_document_version_page(self, document_version_page):
        """
        Extract the content of a page by means of execute() and store it
        in the DocumentVersionPageContent instance of the page, creating
        that instance if it does not exist.

        Any exception raised while obtaining or saving the content is
        logged and raised again as a ParserError.
        """
        # The model is obtained from the app registry at call time
        # instead of being imported at the module level.
        DocumentVersionPageContent = apps.get_model(
            app_label='document_parsing',
            model_name='DocumentVersionPageContent'
        )

        logger.info(
            'Processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

        file_object = document_version_page.document_version.get_intermediate_file()

        try:
            document_version_page_content, created = DocumentVersionPageContent.objects.get_or_create(
                document_version_page=document_version_page
            )
            document_version_page_content.content = self.execute(
                file_object=file_object, page_number=document_version_page.page_number
            )
            document_version_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            # Close the file regardless of the outcome of the parsing
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_version_page.page_number,
            document_version_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the content of page number page_number of file_object.
        Must be implemented by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Read the path of the pdftotext executable from the settings.

        Raises ParserError if nothing exists at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Return the output of pdftotext for page number page_number of
        the PDF contained in file_object.

        The raw output bytes of pdftotext are returned, minus the
        trailing page terminator when present. An empty string is
        returned when the output is just a form feed character.

        Raises ParserError if pdftotext exits with a non zero return
        code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        temporary_file_object = NamedTemporaryFile()
        try:
            # pdftotext is given a file name, so the content is copied
            # to a named temporary file first.
            copyfileobj(fsrc=file_object, fdst=temporary_file_object)
            temporary_file_object.seek(0)

            # -f and -l are the first and last page to convert, the
            # final '-' sends the text to the standard output.
            command = [
                self.pdftotext_path,
                '-f', str(page_number),
                '-l', str(page_number),
                temporary_file_object.name,
                '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. Calling
            # wait() before reading can deadlock when the process fills
            # a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # Always release the temporary file, even if the copy or
            # the execution of the process failed.
            temporary_file_object.close()

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError(
                _('pdftotext exited with return code: %d') % proc.returncode
            )

        if output == b'\x0c':
            # A lone form feed character means the page has no text
            logger.debug('Parser didn\'t return any output')
            return ''

        if output[-3:] == b'\x0a\x0a\x0c':
            # Strip the two line feeds and form feed at the end
            return output[:-3]

        return output
# Make PopplerParser the parser registered for PDF documents.
Parser.register(
    parser_classes=(PopplerParser,), mimetypes=('application/pdf',)
)