From e9591c92f935f9aadbaace43273ca5f86c5087a5 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 23 Aug 2017 02:23:14 -0400 Subject: [PATCH] Initial commit of the document parsing app. Signed-off-by: Roberto Rosario --- mayan/apps/document_parsing/__init__.py | 3 + mayan/apps/document_parsing/admin.py | 23 ++ mayan/apps/document_parsing/api_views.py | 97 +++++++++ mayan/apps/document_parsing/apps.py | 125 +++++++++++ mayan/apps/document_parsing/exceptions.py | 22 ++ mayan/apps/document_parsing/forms.py | 104 +++++++++ mayan/apps/document_parsing/handlers.py | 15 ++ mayan/apps/document_parsing/links.py | 27 +++ mayan/apps/document_parsing/managers.py | 14 ++ mayan/apps/document_parsing/models.py | 47 ++++ mayan/apps/document_parsing/parsers.py | 202 ++++++++++++++++++ mayan/apps/document_parsing/permissions.py | 11 + mayan/apps/document_parsing/queues.py | 10 + mayan/apps/document_parsing/serializers.py | 11 + mayan/apps/document_parsing/settings.py | 17 ++ mayan/apps/document_parsing/tests/__init__.py | 0 mayan/apps/document_parsing/tests/test_api.py | 88 ++++++++ .../document_parsing/tests/test_events.py | 41 ++++ .../document_parsing/tests/test_models.py | 77 +++++++ .../document_parsing/tests/test_parsers.py | 83 +++++++ .../apps/document_parsing/tests/test_views.py | 61 ++++++ mayan/apps/document_parsing/urls.py | 65 ++++++ mayan/apps/document_parsing/utils.py | 16 ++ mayan/apps/document_parsing/views.py | 190 ++++++++++++++++ mayan/settings/base.py | 1 + 25 files changed, 1350 insertions(+) create mode 100644 mayan/apps/document_parsing/__init__.py create mode 100644 mayan/apps/document_parsing/admin.py create mode 100644 mayan/apps/document_parsing/api_views.py create mode 100644 mayan/apps/document_parsing/apps.py create mode 100644 mayan/apps/document_parsing/exceptions.py create mode 100644 mayan/apps/document_parsing/forms.py create mode 100644 mayan/apps/document_parsing/handlers.py create mode 100644 mayan/apps/document_parsing/links.py create 
mode 100644 mayan/apps/document_parsing/managers.py create mode 100644 mayan/apps/document_parsing/models.py create mode 100644 mayan/apps/document_parsing/parsers.py create mode 100644 mayan/apps/document_parsing/permissions.py create mode 100644 mayan/apps/document_parsing/queues.py create mode 100644 mayan/apps/document_parsing/serializers.py create mode 100644 mayan/apps/document_parsing/settings.py create mode 100644 mayan/apps/document_parsing/tests/__init__.py create mode 100644 mayan/apps/document_parsing/tests/test_api.py create mode 100644 mayan/apps/document_parsing/tests/test_events.py create mode 100644 mayan/apps/document_parsing/tests/test_models.py create mode 100644 mayan/apps/document_parsing/tests/test_parsers.py create mode 100644 mayan/apps/document_parsing/tests/test_views.py create mode 100644 mayan/apps/document_parsing/urls.py create mode 100644 mayan/apps/document_parsing/utils.py create mode 100644 mayan/apps/document_parsing/views.py diff --git a/mayan/apps/document_parsing/__init__.py b/mayan/apps/document_parsing/__init__.py new file mode 100644 index 0000000000..79c2287b15 --- /dev/null +++ b/mayan/apps/document_parsing/__init__.py @@ -0,0 +1,3 @@ +from __future__ import unicode_literals + +default_app_config = 'document_parsing.apps.DocumentParsingApp' diff --git a/mayan/apps/document_parsing/admin.py b/mayan/apps/document_parsing/admin.py new file mode 100644 index 0000000000..1bb19bf3ac --- /dev/null +++ b/mayan/apps/document_parsing/admin.py @@ -0,0 +1,23 @@ +from __future__ import unicode_literals + +from django.contrib import admin + +from .models import ( + DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError +) + + +@admin.register(DocumentPageContent) +class DocumentPageContentAdmin(admin.ModelAdmin): + list_display = ('document_page',) + + +@admin.register(DocumentTypeSettings) +class DocumentTypeSettingsAdmin(admin.ModelAdmin): + list_display = ('document_type', 'auto_ocr') + + 
+@admin.register(DocumentVersionOCRError) +class DocumentVersionOCRErrorAdmin(admin.ModelAdmin): + list_display = ('document_version', 'datetime_submitted') + readonly_fields = ('document_version', 'datetime_submitted', 'result') diff --git a/mayan/apps/document_parsing/api_views.py b/mayan/apps/document_parsing/api_views.py new file mode 100644 index 0000000000..ded56e8ed8 --- /dev/null +++ b/mayan/apps/document_parsing/api_views.py @@ -0,0 +1,97 @@ +from __future__ import absolute_import, unicode_literals + +from rest_framework import generics, status +from rest_framework.response import Response + +from documents.models import Document, DocumentPage, DocumentVersion +from rest_api.permissions import MayanPermission + +from .models import DocumentPageContent +from .permissions import permission_ocr_content_view, permission_ocr_document +from .serializers import DocumentPageContentSerializer + + +class APIDocumentOCRView(generics.GenericAPIView): + mayan_object_permissions = { + 'POST': (permission_ocr_document,) + } + permission_classes = (MayanPermission,) + queryset = Document.objects.all() + + def get_serializer_class(self): + return None + + def post(self, request, *args, **kwargs): + """ + Submit a document for OCR. + --- + omit_serializer: true + parameters: + - name: pk + paramType: path + type: number + responseMessages: + - code: 202 + message: Accepted + """ + + self.get_object().submit_for_ocr() + return Response(status=status.HTTP_202_ACCEPTED) + + +class APIDocumentVersionOCRView(generics.GenericAPIView): + mayan_object_permissions = { + 'POST': (permission_ocr_document,) + } + permission_classes = (MayanPermission,) + queryset = DocumentVersion.objects.all() + + def get_serializer_class(self): + return None + + def post(self, request, *args, **kwargs): + """ + Submit a document version for OCR. 
+ --- + omit_serializer: true + parameters: + - name: pk + paramType: path + type: number + responseMessages: + - code: 202 + message: Accepted + """ + + self.get_object().submit_for_ocr() + return Response(status=status.HTTP_202_ACCEPTED) + + +class APIDocumentPageContentView(generics.RetrieveAPIView): + """ + Returns the OCR content of the selected document page. + --- + GET: + parameters: + - name: pk + paramType: path + type: number + """ + + mayan_object_permissions = { + 'GET': (permission_ocr_content_view,), + } + permission_classes = (MayanPermission,) + serializer_class = DocumentPageContentSerializer + queryset = DocumentPage.objects.all() + + def retrieve(self, request, *args, **kwargs): + instance = self.get_object() + + try: + ocr_content = instance.ocr_content + except DocumentPageContent.DoesNotExist: + ocr_content = DocumentPageContent.objects.none() + + serializer = self.get_serializer(ocr_content) + return Response(serializer.data) diff --git a/mayan/apps/document_parsing/apps.py b/mayan/apps/document_parsing/apps.py new file mode 100644 index 0000000000..6b9a68d499 --- /dev/null +++ b/mayan/apps/document_parsing/apps.py @@ -0,0 +1,125 @@ +from __future__ import unicode_literals + +import logging + +from kombu import Exchange, Queue + +from django.apps import apps +from django.db.models.signals import post_save +from django.utils.translation import ugettext_lazy as _ + +from acls import ModelPermission +from common import ( + MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary, + menu_tools +) +from common.settings import settings_db_sync_task_delay +from documents.search import document_search, document_page_search +from documents.signals import post_version_upload +from documents.widgets import document_link +from mayan.celery import app +from navigation import SourceColumn +from rest_api.classes import APIEndPoint + +from .handlers import handler_parse_document_version +from .links import ( + link_document_content, 
link_entry_list, link_document_content_errors_list, + link_document_content_download +) +from .permissions import permission_content_view + +logger = logging.getLogger(__name__) + + +class DocumentParsingApp(MayanAppConfig): + has_tests = True + name = 'document_parsing' + verbose_name = _('Document parsing') + + def ready(self): + super(DocumentParsingApp, self).ready() + + APIEndPoint(app=self, version_string='1') + + Document = apps.get_model( + app_label='documents', model_name='Document' + ) + + DocumentType = apps.get_model( + app_label='documents', model_name='DocumentType' + ) + + DocumentVersion = apps.get_model( + app_label='documents', model_name='DocumentVersion' + ) + + DocumentVersionParseError = self.get_model('DocumentVersionParseError') + + ModelPermission.register( + model=Document, permissions=(permission_content_view,) + ) + + SourceColumn( + source=DocumentVersionParseError, label=_('Document'), + func=lambda context: document_link(context['object'].document_version.document) + ) + SourceColumn( + source=DocumentVersionParseError, label=_('Added'), + attribute='datetime_submitted' + ) + SourceColumn( + source=DocumentVersionParseError, label=_('Result'), + attribute='result' + ) + + document_search.add_model_field( + field='versions__pages__content__content', label=_('Content') + ) + + document_page_search.add_model_field( + field='content__content', label=_('Content') + ) + + menu_facet.bind_links( + links=(link_document_content,), sources=(Document,) + ) + menu_multi_item.bind_links( + links=(link_document_submit_multiple,), sources=(Document,) + ) + menu_object.bind_links( + links=(link_document_submit,), sources=(Document,) + ) + menu_object.bind_links( + links=(link_document_type_ocr_settings,), sources=(DocumentType,) + ) + menu_secondary.bind_links( + links=( + link_document_content, link_document_ocr_erros_list, + link_document_ocr_download + ), + sources=( + 'document_parsing:document_content', + 
'document_parsing:document_ocr_error_list', + 'document_parsing:document_ocr_download', + ) + ) + menu_secondary.bind_links( + links=(link_entry_list,), + sources=( + 'document_parsing:entry_list', + 'document_parsing:entry_delete_multiple', + 'document_parsing:entry_re_queue_multiple', + DocumentVersionParseError + ) + ) + menu_tools.bind_links( + links=( + link_entry_list, + ) + ) + + post_version_upload.connect( + dispatch_uid='document_parsing_handler_parse_document_version', + receiver=handler_parse_document_version, + sender=DocumentVersion + ) diff --git a/mayan/apps/document_parsing/exceptions.py b/mayan/apps/document_parsing/exceptions.py new file mode 100644 index 0000000000..9fc7a9b90a --- /dev/null +++ b/mayan/apps/document_parsing/exceptions.py @@ -0,0 +1,22 @@ +from __future__ import unicode_literals + + +class OCRError(Exception): + """ + Raised by the OCR backend + """ + pass + + +class ParserError(Exception): + """ + Base exception for file parsers + """ + pass + + +class NoMIMETypeMatch(ParserError): + """ + There is no parser registered for the specified MIME type + """ + pass diff --git a/mayan/apps/document_parsing/forms.py b/mayan/apps/document_parsing/forms.py new file mode 100644 index 0000000000..0881a9185a --- /dev/null +++ b/mayan/apps/document_parsing/forms.py @@ -0,0 +1,104 @@ +from __future__ import unicode_literals + +from django import forms +from django.utils.encoding import force_text +from django.utils.html import conditional_escape +from django.utils.safestring import mark_safe +from django.utils.translation import ugettext_lazy as _, ugettext + +from common.widgets import TextAreaDiv +from documents.models import DocumentType + +from .models import DocumentPageContent, DocumentPageOCRContent + + +class DocumentContentForm(forms.Form): + """ + Form that concatenates all of a document pages' text content into a + single textarea widget + """ + def __init__(self, *args, **kwargs): + self.document = kwargs.pop('instance', None) + 
super(DocumentContentForm, self).__init__(*args, **kwargs) + content = [] + self.fields['contents'].initial = '' + try: + document_pages = self.document.pages.all() + except AttributeError: + document_pages = [] + + for page in document_pages: + try: + page_content = page.ocr_content.content + except DocumentPageContent.DoesNotExist: + pass + else: + content.append(conditional_escape(force_text(page_content))) + content.append( + '\n\n\n
- %s -

\n\n\n' % ( + ugettext( + 'Page %(page_number)d' + ) % {'page_number': page.page_number} + ) + ) + + self.fields['contents'].initial = mark_safe(''.join(content)) + + contents = forms.CharField( + label=_('Contents'), + widget=TextAreaDiv( + attrs={ + 'class': 'text_area_div full-height', + 'data-height-difference': 360 + } + ) + ) + + +class DocumentOCRContentForm(forms.Form): + """ + Form that concatenates all of a document pages' text content into a + single textarea widget + """ + def __init__(self, *args, **kwargs): + self.document = kwargs.pop('instance', None) + super(DocumentOCRContentForm, self).__init__(*args, **kwargs) + content = [] + self.fields['contents'].initial = '' + try: + document_pages = self.document.pages.all() + except AttributeError: + document_pages = [] + + for page in document_pages: + try: + page_content = page.ocr_content.content + except DocumentPageOCRContent.DoesNotExist: + pass + else: + content.append(conditional_escape(force_text(page_content))) + content.append( + '\n\n\n
- %s -

\n\n\n' % ( + ugettext( + 'Page %(page_number)d' + ) % {'page_number': page.page_number} + ) + ) + + self.fields['contents'].initial = mark_safe(''.join(content)) + + contents = forms.CharField( + label=_('Contents'), + widget=TextAreaDiv( + attrs={ + 'class': 'text_area_div full-height', + 'data-height-difference': 360 + } + ) + ) + + +class DocumentTypeSelectForm(forms.Form): + document_type = forms.ModelChoiceField( + queryset=DocumentType.objects.all(), label=('Document type') + ) diff --git a/mayan/apps/document_parsing/handlers.py b/mayan/apps/document_parsing/handlers.py new file mode 100644 index 0000000000..618826246c --- /dev/null +++ b/mayan/apps/document_parsing/handlers.py @@ -0,0 +1,15 @@ +from __future__ import unicode_literals + +import logging + +from django.apps import apps + +from .settings import setting_auto_ocr +from .parsers import Parser + +logger = logging.getLogger(__name__) + + +def handler_parse_document_version(sender, instance, **kwargs): + if kwargs['created']: + Parser.parse_document_version(document_version=instance) diff --git a/mayan/apps/document_parsing/links.py b/mayan/apps/document_parsing/links.py new file mode 100644 index 0000000000..cce30bcad5 --- /dev/null +++ b/mayan/apps/document_parsing/links.py @@ -0,0 +1,27 @@ +from __future__ import unicode_literals + +from django.utils.translation import ugettext_lazy as _ + +from navigation import Link + +from .permissions import permission_content_view + +link_document_content = Link( + args='resolved_object.id', icon='fa fa-font', + permissions=(permission_content_view,), text=_('Content'), + view='document_parsing:document_content', +) +link_entry_list = Link( + icon='fa fa-file-text-o', permissions=(permission_ocr_document,), + text=_('Parsing errors'), view='document_parsing:entry_list' +) +link_document_content_errors_list = Link( + args='resolved_object.id', icon='fa fa-file-text-o', + permissions=(permission_ocr_content_view,), text=_('Parsing errors'), + 
view='document_parsing:document_page_parsing_error_list' +) +link_document_content_download = Link( + args='resolved_object.id', icon='fa fa-file-text-o', + permissions=(permission_ocr_content_view,), text=_('Download content'), + view='document_parsing:document_content_download' +) diff --git a/mayan/apps/document_parsing/managers.py b/mayan/apps/document_parsing/managers.py new file mode 100644 index 0000000000..2e17131486 --- /dev/null +++ b/mayan/apps/document_parsing/managers.py @@ -0,0 +1,14 @@ +from __future__ import unicode_literals + +from datetime import timedelta +import logging + +from django.apps import apps +from django.db import models +from django.utils.timezone import now + +logger = logging.getLogger(__name__) + + +class DocumentPageContentManager(models.Manager): + pass diff --git a/mayan/apps/document_parsing/models.py b/mayan/apps/document_parsing/models.py new file mode 100644 index 0000000000..38dc9ff7f1 --- /dev/null +++ b/mayan/apps/document_parsing/models.py @@ -0,0 +1,47 @@ +from __future__ import unicode_literals + +from django.db import models +from django.utils.encoding import force_text, python_2_unicode_compatible +from django.utils.translation import ugettext_lazy as _ + +from documents.models import DocumentPage, DocumentType, DocumentVersion + +from .managers import DocumentPageContentManager + + +@python_2_unicode_compatible +class DocumentPageContent(models.Model): + document_page = models.OneToOneField( + DocumentPage, on_delete=models.CASCADE, related_name='content', + verbose_name=_('Document page') + ) + content = models.TextField(blank=True, verbose_name=_('Content')) + + objects = DocumentPageContentManager() + + def __str__(self): + return force_text(self.document_page) + + class Meta: + verbose_name = _('Document page content') + verbose_name_plural = _('Document pages contents') + + +@python_2_unicode_compatible +class DocumentVersionParseError(models.Model): + document_version = models.ForeignKey( + DocumentVersion, 
on_delete=models.CASCADE, related_name='parse_errors', + verbose_name=_('Document version') + ) + datetime_submitted = models.DateTimeField( + auto_now_add=True, db_index=True, verbose_name=_('Date time submitted') + ) + result = models.TextField(blank=True, null=True, verbose_name=_('Result')) + + def __str__(self): + return force_text(self.document_version) + + class Meta: + ordering = ('datetime_submitted',) + verbose_name = _('Document version parse error') + verbose_name_plural = _('Document version parse errors') diff --git a/mayan/apps/document_parsing/parsers.py b/mayan/apps/document_parsing/parsers.py new file mode 100644 index 0000000000..87570afa1f --- /dev/null +++ b/mayan/apps/document_parsing/parsers.py @@ -0,0 +1,202 @@ +from __future__ import unicode_literals + +from io import BytesIO +import logging +import os +from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter +from pdfminer.pdfpage import PDFPage +from pdfminer.converter import TextConverter +from pdfminer.layout import LAParams +import subprocess + +from django.utils.translation import ugettext_lazy as _ + +from common.utils import copyfile, fs_cleanup, mkstemp + +from .exceptions import ParserError, NoMIMETypeMatch +from .models import DocumentPageContent +from .settings import setting_pdftotext_path + +logger = logging.getLogger(__name__) + + +class Parser(object): + """ + Parser base class + """ + + _registry = {} + + @classmethod + def register(cls, mimetypes, parser_classes): + for mimetype in mimetypes: + for parser_class in parser_classes: + cls._registry.setdefault( + mimetype, [] + ).append(parser_class) + + @classmethod + def parse_document_version(cls, document_version): + try: + for parser_class in cls._registry[document_version.mimetype]: + try: + parser = parser_class() + parser.process_document_version(document_version) + except ParserError: + # If parser raises error, try next parser in the list + pass + else: + # If parser was successful there is no need to 
try + # others in the list for this mimetype + return + + raise NoMIMETypeMatch('Parser MIME type list exhausted') + except KeyError: + raise NoMIMETypeMatch + + @classmethod + def parse_document_page(cls, document_page): + try: + for parser_class in cls._registry[document_page.document_version.mimetype]: + try: + parser = parser_class() + parser.process_document_page(document_page) + except ParserError: + # If parser raises error, try next parser in the list + pass + else: + # If parser was successfull there is no need to try + # others in the list for this mimetype + return + raise NoMIMETypeMatch('Parser MIME type list exhausted') + except KeyError: + raise NoMIMETypeMatch + + def process_document_version(self, document_version): + logger.info( + 'Starting parsing for document version: %s', document_version + ) + logger.debug('document version: %d', document_version.pk) + + for document_page in document_version.pages.all(): + self.process_document_page(document_page=document_page) + + def process_document_page(self, document_page): + logger.info( + 'Processing page: %d of document version: %s', + document_page.page_number, document_page.document_version + ) + + file_object = document_page.document_version.get_intermidiate_file() + + try: + document_page_content, created = DocumentPageContent.objects.get_or_create( + document_page=document_page + ) + document_page_content.content = self.execute( + file_object=file_object, page_number=document_page.page_number + ) + document_page_content.save() + except Exception as exception: + error_message = _('Exception parsing page; %s') % exception + logger.error(error_message) + raise ParserError(error_message) + finally: + file_object.close() + + logger.info( + 'Finished processing page: %d of document version: %s', + document_page.page_number, document_page.document_version + ) + + def execute(self, file_object, page_number): + raise NotImplementedError( + 'Your %s class has not defined the required execute() method.' 
% + self.__class__.__name__ + ) + + +class PopplerParser(Parser): + """ + PDF parser using the pdftotext execute from the poppler package + """ + + def __init__(self): + self.pdftotext_path = setting_pdftotext_path.value + if not os.path.exists(self.pdftotext_path): + error_message = _( + 'Cannot find pdftotext executable at: %s' + ) % self.pdftotext_path + logger.error(error_message) + raise ParserError(error_message) + + logger.debug('self.pdftotext_path: %s', self.pdftotext_path) + + def execute(self, file_object, page_number): + logger.debug('Parsing PDF page: %d', page_number) + + destination_descriptor, temp_filepath = mkstemp() + copyfile(file_object, temp_filepath) + + command = [] + command.append(self.pdftotext_path) + command.append('-f') + command.append(str(page_number)) + command.append('-l') + command.append(str(page_number)) + command.append(temp_filepath) + command.append('-') + + proc = subprocess.Popen( + command, close_fds=True, stderr=subprocess.PIPE, + stdout=subprocess.PIPE + ) + return_code = proc.wait() + if return_code != 0: + logger.error(proc.stderr.readline()) + fs_cleanup(temp_filepath, file_descriptor=destination_descriptor) + + raise ParserError + + output = proc.stdout.read() + fs_cleanup(temp_filepath, file_descriptor=destination_descriptor) + + if output == b'\x0c': + logger.debug('Parser didn\'t return any output') + return '' + + if output[-3:] == b'\x0a\x0a\x0c': + return output[:-3] + + return output + + +class PDFMinerParser(Parser): + """ + Parser for PDF files using the PDFMiner library for Python + """ + + def execute(self, file_object, page_number): + logger.debug('Parsing PDF page: %d', page_number) + + with BytesIO() as string_buffer: + rsrcmgr = PDFResourceManager() + device = TextConverter( + rsrcmgr, outfp=string_buffer, laparams=LAParams() + ) + interpreter = PDFPageInterpreter(rsrcmgr, device) + page = PDFPage.get_pages( + file_object, maxpages=1, pagenos=(page_number - 1,) + ) + 
interpreter.process_page(page.next()) + device.close() + + logger.debug('Finished parsing PDF: %d', page_number) + + return string_buffer.getvalue() + + +Parser.register( + mimetypes=('application/pdf',), + parser_classes=(PopplerParser, PDFMinerParser) +) diff --git a/mayan/apps/document_parsing/permissions.py b/mayan/apps/document_parsing/permissions.py new file mode 100644 index 0000000000..fd003b8f35 --- /dev/null +++ b/mayan/apps/document_parsing/permissions.py @@ -0,0 +1,11 @@ +from __future__ import absolute_import, unicode_literals + +from django.utils.translation import ugettext_lazy as _ + +from permissions import PermissionNamespace + +namespace = PermissionNamespace('document_parsing', _('Document parsing')) + +permission_content_view = namespace.add_permission( + name='content_view', label=_('View the content of a document') +) diff --git a/mayan/apps/document_parsing/queues.py b/mayan/apps/document_parsing/queues.py new file mode 100644 index 0000000000..92297a2524 --- /dev/null +++ b/mayan/apps/document_parsing/queues.py @@ -0,0 +1,10 @@ +from __future__ import unicode_literals + +from django.utils.translation import ugettext_lazy as _ + +from task_manager.classes import CeleryQueue + +queue_ocr = CeleryQueue(name='ocr', label=_('OCR')) +queue_ocr.add_task_type( + name='ocr.tasks.task_do_ocr', label=_('Document version OCR') +) diff --git a/mayan/apps/document_parsing/serializers.py b/mayan/apps/document_parsing/serializers.py new file mode 100644 index 0000000000..7161d2fc40 --- /dev/null +++ b/mayan/apps/document_parsing/serializers.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +from rest_framework import serializers + +from .models import DocumentPageContent + + +class DocumentPageContentSerializer(serializers.ModelSerializer): + class Meta: + fields = ('content',) + model = DocumentPageContent diff --git a/mayan/apps/document_parsing/settings.py b/mayan/apps/document_parsing/settings.py new file mode 100644 index 
0000000000..47caeb44a8 --- /dev/null +++ b/mayan/apps/document_parsing/settings.py @@ -0,0 +1,17 @@ +from __future__ import unicode_literals + +from django.utils.translation import ugettext_lazy as _ + +from smart_settings import Namespace + +namespace = Namespace(name='document_parsing', label=_('Document parsing')) + +setting_pdftotext_path = namespace.add_setting( + global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH', + default='/usr/bin/pdftotext', + help_text=_( + 'File path to poppler\'s pdftotext program used to extract text ' + 'from PDF files.' + ), + is_path=True +) diff --git a/mayan/apps/document_parsing/tests/__init__.py b/mayan/apps/document_parsing/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mayan/apps/document_parsing/tests/test_api.py b/mayan/apps/document_parsing/tests/test_api.py new file mode 100644 index 0000000000..fb73bef98d --- /dev/null +++ b/mayan/apps/document_parsing/tests/test_api.py @@ -0,0 +1,88 @@ +from __future__ import unicode_literals + +import json + +from django.contrib.auth import get_user_model +from django.urls import reverse + +from rest_framework import status + +from documents.models import DocumentType +from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH +from rest_api.tests import BaseAPITestCase +from user_management.tests import ( + TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME +) + + +class OCRAPITestCase(BaseAPITestCase): + """ + Test the OCR app API endpoints + """ + + def setUp(self): + super(OCRAPITestCase, self).setUp() + + self.admin_user = get_user_model().objects.create_superuser( + username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL, + password=TEST_ADMIN_PASSWORD + ) + + self.client.login( + username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD + ) + + self.document_type = DocumentType.objects.create( + label=TEST_DOCUMENT_TYPE_LABEL + ) + + with open(TEST_SMALL_DOCUMENT_PATH) as file_object: + self.document = 
self.document_type.new_document( + file_object=file_object, + ) + + def tearDown(self): + self.document_type.delete() + super(OCRAPITestCase, self).tearDown() + + def test_submit_document(self): + response = self.client.post( + reverse( + 'rest_api:document-ocr-submit-view', + args=(self.document.pk,) + ) + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + + content = self.document.pages.first().ocr_content.content + + self.assertTrue('Mayan EDMS Documentation' in content) + + def test_submit_document_version(self): + response = self.client.post( + reverse( + 'rest_api:document-version-ocr-submit-view', + args=(self.document.latest_version.pk,) + ) + ) + + self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) + + content = self.document.pages.first().ocr_content.content + + self.assertTrue('Mayan EDMS Documentation' in content) + + def test_get_document_version_page_content(self): + response = self.client.get( + reverse( + 'rest_api:document-page-content-view', + args=(self.document.latest_version.pages.first().pk,) + ), + ) + + self.assertEqual(response.status_code, status.HTTP_200_OK) + + self.assertTrue( + 'Mayan EDMS Documentation' in json.loads(response.content)['content'] + ) diff --git a/mayan/apps/document_parsing/tests/test_events.py b/mayan/apps/document_parsing/tests/test_events.py new file mode 100644 index 0000000000..dc366623d2 --- /dev/null +++ b/mayan/apps/document_parsing/tests/test_events.py @@ -0,0 +1,41 @@ +from __future__ import unicode_literals + +from actstream.models import Action + +from documents.tests.test_models import GenericDocumentTestCase + +from ..events import ( + event_ocr_document_version_submit, event_ocr_document_version_finish +) + + +class OCREventsTestCase(GenericDocumentTestCase): + def test_document_version_submit_event(self): + Action.objects.all().delete() + self.document.submit_for_ocr() + + self.assertEqual( + Action.objects.first().target, self.document.latest_version + ) + 
self.assertEqual( + Action.objects.first().verb, + event_ocr_document_version_submit.name + ) + + def test_document_version_finish_event(self): + Action.objects.all().delete() + self.document.submit_for_ocr() + from ..models import DocumentVersionOCRError, DocumentPageContent + #print DocumentVersionOCRError.objects.all() + print DocumentPageContent.objects.all() + + for a in Action.objects.all(): + print a + + self.assertEqual( + Action.objects.last().target, self.document.latest_version + ) + self.assertEqual( + Action.objects.last().verb, + event_ocr_document_version_finish.name + ) diff --git a/mayan/apps/document_parsing/tests/test_models.py b/mayan/apps/document_parsing/tests/test_models.py new file mode 100644 index 0000000000..36dbb57f67 --- /dev/null +++ b/mayan/apps/document_parsing/tests/test_models.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- + +from __future__ import unicode_literals + +from common.tests import BaseTestCase +from documents.models import DocumentType +from documents.settings import setting_language_choices +from documents.tests import ( + TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH +) + + +class DocumentOCRTestCase(BaseTestCase): + # PyOCR's leak descriptor in get_available_languages and image_to_string + # Disable descriptor leak test until fixed in upstream + _skip_file_descriptor_test = True + + def setUp(self): + super(DocumentOCRTestCase, self).setUp() + + self.document_type = DocumentType.objects.create( + label=TEST_DOCUMENT_TYPE_LABEL + ) + + with open(TEST_SMALL_DOCUMENT_PATH) as file_object: + self.document = self.document_type.new_document( + file_object=file_object, + ) + + def tearDown(self): + self.document.delete() + self.document_type.delete() + super(DocumentOCRTestCase, self).tearDown() + + def test_ocr_language_backends_end(self): + content = self.document.pages.first().ocr_content.content + self.assertTrue('Mayan EDMS Documentation' in content) + + +class 
GermanOCRSupportTestCase(BaseTestCase): + # PyOCR's leak descriptor in get_available_languages and image_to_string + # Disable descriptor leak test until fixed in upstream + _skip_file_descriptor_test = True + + def setUp(self): + super(GermanOCRSupportTestCase, self).setUp() + + self.document_type = DocumentType.objects.create( + label=TEST_DOCUMENT_TYPE_LABEL + ) + + # Get corresponding language code for German from the default language + # choices list + language_code = [ + language for language in setting_language_choices.value if language[1] == 'German' + ][0][0] + + self.assertEqual('deu', language_code) + + with open(TEST_DEU_DOCUMENT_PATH) as file_object: + self.document = self.document_type.new_document( + file_object=file_object, language=language_code + ) + + def tearDown(self): + self.document_type.delete() + super(GermanOCRSupportTestCase, self).tearDown() + + def test_ocr_language_backends_end(self): + content = self.document.pages.first().ocr_content.content + + self.assertTrue( + 'Repository für elektronische Dokumente.' 
in content
+        )
+        self.assertTrue(
+            'Es bietet einen' in content
+        )
diff --git a/mayan/apps/document_parsing/tests/test_parsers.py b/mayan/apps/document_parsing/tests/test_parsers.py
new file mode 100644
index 0000000000..9d500a572a
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_parsers.py
@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+from django.core.files.base import File
+from django.test import override_settings
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.tests import (
+    TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
+)
+
+from ..classes import TextExtractor  # FIXME: this commit adds no classes.py
+from ..parsers import PDFMinerParser, PopplerParser
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class ParserTestCase(BaseTestCase):
+    def setUp(self):
+        super(ParserTestCase, self).setUp()
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        with open(TEST_DOCUMENT_PATH, mode='rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(ParserTestCase, self).tearDown()
+
+    def test_pdfminer_parser(self):
+        parser = PDFMinerParser()
+
+        parser.process_document_version(self.document.latest_version)
+
+        self.assertTrue(
+            'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
+        )
+
+    def test_poppler_parser(self):
+        parser = PopplerParser()
+
+        parser.process_document_version(self.document.latest_version)
+
+        self.assertTrue(
+            'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
+        )
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class TextExtractorTestCase(BaseTestCase):
+    def setUp(self):
+        super(TextExtractorTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        with open(TEST_HYBRID_DOCUMENT_PATH, mode='rb') as file_object:
+            self.document =
self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(TextExtractorTestCase, self).tearDown()
+
+    def test_text_extractor(self):
+        TextExtractor.process_document_version(
+            document_version=self.document.latest_version
+        )
+
+        self.assertEqual(
+            self.document.latest_version.pages.first().ocr_content.content,
+            'Sample text',
+        )
+
+        self.assertEqual(
+            self.document.latest_version.pages.last().ocr_content.content,
+            'Sample text in image form',
+        )
diff --git a/mayan/apps/document_parsing/tests/test_views.py b/mayan/apps/document_parsing/tests/test_views.py
new file mode 100644
index 0000000000..41b0462103
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_views.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from django.test import override_settings
+
+from documents.tests.test_views import GenericDocumentViewTestCase
+
+from ..permissions import permission_ocr_content_view
+from ..utils import get_document_ocr_content
+
+
+@override_settings(OCR_AUTO_OCR=True)
+class OCRViewsTestCase(GenericDocumentViewTestCase):
+    # PyOCR leaks file descriptors in get_available_languages and
+    # image_to_string. Disable the descriptor leak test until fixed upstream.
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(OCRViewsTestCase, self).setUp()
+        self.login_user()
+
+    def _document_content_view(self):
+        return self.get(
+            'ocr:document_content', args=(self.document.pk,)
+        )
+
+    def test_document_content_view_no_permissions(self):
+        response = self._document_content_view()
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_content_view_with_permission(self):
+        self.grant_permission(permission=permission_ocr_content_view)
+
+        response = self._document_content_view()
+
+        self.assertContains(
+            response, 'Mayan EDMS Documentation', status_code=200
+        )
+
+    def test_document_ocr_download_view_no_permission(self):
+        response = self.get(
+
'ocr:document_ocr_download', args=(self.document.pk,)
+        )
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_download_view_with_permission(self):
+        self.expected_content_type = 'application/octet-stream; charset=utf-8'
+
+        self.grant_permission(permission=permission_ocr_content_view)
+        response = self.get(
+            'ocr:document_ocr_download', args=(self.document.pk,)
+        )
+
+        self.assertEqual(response.status_code, 200)
+
+        self.assert_download_response(
+            response, content=(
+                ''.join(get_document_ocr_content(document=self.document))
+            ),
+        )
diff --git a/mayan/apps/document_parsing/urls.py b/mayan/apps/document_parsing/urls.py
new file mode 100644
index 0000000000..30f1b59359
--- /dev/null
+++ b/mayan/apps/document_parsing/urls.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from django.conf.urls import url
+
+from .api_views import (
+    APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
+)
+from .views import (
+    DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
+    DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
+    DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
+)
+
+urlpatterns = [
+    url(
+        r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(),
+        name='document_content'
+    ),
+    url(
+        r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
+        name='document_submit'
+    ),
+    url(
+        r'^document/all/submit/$', DocumentAllSubmitView.as_view(),
+        name='document_submit_all'
+    ),
+    url(
+        r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
+        name='document_type_submit'
+    ),
+    url(
+        r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(),
+        name='document_submit_multiple'
+    ),
+    url(
+        r'^document_type/(?P<pk>\d+)/ocr/settings/$',
+        DocumentTypeSettingsEditView.as_view(),
+        name='document_type_ocr_settings'
+    ),
+    url(
+        r'^documents/(?P<pk>\d+)/ocr/errors/$',
+        DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
+    ),
+    url(
+
r'^documents/(?P<pk>\d+)/ocr/download/$',
+        DocumentOCRDownloadView.as_view(), name='document_ocr_download'
+    ),
+    url(r'^all/$', EntryListView.as_view(), name='entry_list'),
+]
+
+api_urls = [
+    url(
+        r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
+        name='document-ocr-submit-view'
+    ),
+    url(
+        r'^document_version/(?P<pk>\d+)/submit/$',
+        APIDocumentVersionOCRView.as_view(),
+        name='document-version-ocr-submit-view'
+    ),
+    url(
+        r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
+        name='document-page-content-view'
+    ),
+]
diff --git a/mayan/apps/document_parsing/utils.py b/mayan/apps/document_parsing/utils.py
new file mode 100644
index 0000000000..8175c3040e
--- /dev/null
+++ b/mayan/apps/document_parsing/utils.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+from django.utils.encoding import force_text
+from django.utils.html import conditional_escape
+
+from .models import DocumentPageContent
+
+
+def get_document_ocr_content(document):
+    for page in document.pages.all():
+        try:
+            page_content = page.ocr_content.content
+        except DocumentPageContent.DoesNotExist:
+            pass
+        else:
+            yield conditional_escape(force_text(page_content))
diff --git a/mayan/apps/document_parsing/views.py b/mayan/apps/document_parsing/views.py
new file mode 100644
index 0000000000..fa0871735b
--- /dev/null
+++ b/mayan/apps/document_parsing/views.py
@@ -0,0 +1,190 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.contrib import messages
+from django.http import HttpResponseRedirect
+from django.shortcuts import get_object_or_404
+from django.urls import reverse
+from django.utils.translation import ugettext_lazy as _
+
+from acls.models import AccessControlList
+from common.generics import (
+    ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
+    SingleObjectEditView, SingleObjectListView
+)
+from common.mixins import MultipleInstanceActionMixin
+from documents.models import Document, DocumentType
+
+from
.forms import DocumentContentForm, DocumentTypeSelectForm
+from .models import DocumentVersionOCRError
+from .permissions import (
+    permission_ocr_content_view, permission_ocr_document,
+    permission_document_type_ocr_setup
+)
+from .utils import get_document_ocr_content
+
+
+class DocumentAllSubmitView(ConfirmView):
+    extra_context = {'title': _('Submit all documents for OCR?')}
+
+    def get_post_action_redirect(self):
+        return reverse('common:tools_list')
+
+    def view_action(self):
+        count = 0
+        for document in Document.objects.all():
+            document.submit_for_ocr()
+            count += 1
+
+        messages.success(
+            self.request, _('%d documents added to the OCR queue.') % count
+        )
+
+
+class DocumentSubmitView(ConfirmView):
+    def get_extra_context(self):
+        return {
+            'object': self.get_object(),
+            'title': _('Submit "%s" to the OCR queue?') % self.get_object()
+        }
+
+    def get_object(self):
+        return get_object_or_404(Document, pk=self.kwargs['pk'])
+
+    def object_action(self, instance):
+        AccessControlList.objects.check_access(
+            permissions=permission_ocr_document, user=self.request.user,
+            obj=instance
+        )
+
+        instance.submit_for_ocr()
+
+    def view_action(self):
+        instance = self.get_object()
+
+        self.object_action(instance=instance)
+
+        messages.success(
+            self.request,
+            _('Document: %(document)s was added to the OCR queue.') % {
+                'document': instance
+            }
+        )
+
+
+class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
+    model = Document
+    success_message = '%(count)d document submitted to the OCR queue.'
+    success_message_plural = '%(count)d documents submitted to the OCR queue.'
+
+    def get_extra_context(self):
+        # Override the base class method
+        return {
+            'title': _('Submit the selected documents to the OCR queue?')
+        }
+
+
+class DocumentTypeSubmitView(FormView):
+    form_class = DocumentTypeSelectForm
+    extra_context = {
+        'title': _('Submit all documents of a type for OCR')
+    }
+
+    def get_post_action_redirect(self):
+        return reverse('common:tools_list')
+
+    def form_valid(self, form):
+        count = 0
+        for document in form.cleaned_data['document_type'].documents.all():
+            document.submit_for_ocr()
+            count += 1
+
+        messages.success(
+            self.request, _(
+                '%(count)d documents of type "%(document_type)s" added to the '
+                'OCR queue.'
+            ) % {
+                'count': count,
+                'document_type': form.cleaned_data['document_type']
+            }
+        )
+
+        return HttpResponseRedirect(self.get_success_url())
+
+
+class DocumentTypeSettingsEditView(SingleObjectEditView):
+    fields = ('auto_ocr',)
+    view_permission = permission_document_type_ocr_setup
+
+    def get_object(self, queryset=None):
+        return get_object_or_404(
+            DocumentType, pk=self.kwargs['pk']
+        ).ocr_settings
+
+    def get_extra_context(self):
+        return {
+            'title': _(
+                'Edit OCR settings for document type: %s'
+            ) % self.get_object().document_type
+        }
+
+
+class DocumentOCRContent(SingleObjectDetailView):
+    form_class = DocumentContentForm
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def dispatch(self, request, *args, **kwargs):
+        result = super(DocumentOCRContent, self).dispatch(
+            request, *args, **kwargs
+        )
+        self.get_object().add_as_recent_document_for_user(request.user)
+        return result
+
+    def get_extra_context(self):
+        return {
+            'document': self.get_object(),
+            'hide_labels': True,
+            'object': self.get_object(),
+            'title': _('OCR result for document: %s') % self.get_object(),
+        }
+
+
+class EntryListView(SingleObjectListView):
+    extra_context = {
+        'hide_object': True,
+        'title': _('OCR errors'),
+    }
+    view_permission = permission_ocr_document
+
+    def get_object_list(self):
+        return
DocumentVersionOCRError.objects.all()
+
+
+class DocumentOCRErrorsListView(SingleObjectListView):
+    view_permission = permission_ocr_document
+
+    def get_document(self):
+        return get_object_or_404(Document, pk=self.kwargs['pk'])
+
+    def get_extra_context(self):
+        return {
+            'hide_object': True,
+            'object': self.get_document(),
+            'title': _('OCR errors for document: %s') % self.get_document(),
+        }
+
+    def get_object_list(self):
+        return self.get_document().latest_version.ocr_errors.all()
+
+
+class DocumentOCRDownloadView(SingleObjectDownloadView):
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def get_file(self):
+        file_object = DocumentOCRDownloadView.TextIteratorIO(
+            iterator=get_document_ocr_content(document=self.get_object())
+        )
+        return DocumentOCRDownloadView.VirtualFile(
+            file=file_object, name='{}-OCR'.format(self.get_object())
+        )
diff --git a/mayan/settings/base.py b/mayan/settings/base.py
index 349a23fa0a..a2ef68b050 100644
--- a/mayan/settings/base.py
+++ b/mayan/settings/base.py
@@ -84,6 +84,7 @@ INSTALLED_APPS = (
     'checkouts',
     'document_comments',
     'document_indexing',
+    'document_parsing',
     'document_signatures',
     'document_states',
     'documents',