diff --git a/mayan/apps/document_parsing/__init__.py b/mayan/apps/document_parsing/__init__.py
new file mode 100644
index 0000000000..79c2287b15
--- /dev/null
+++ b/mayan/apps/document_parsing/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import unicode_literals
+
+# Dotted path of the app configuration class (DocumentParsingApp in apps.py).
+default_app_config = 'document_parsing.apps.DocumentParsingApp'
diff --git a/mayan/apps/document_parsing/admin.py b/mayan/apps/document_parsing/admin.py
new file mode 100644
index 0000000000..1bb19bf3ac
--- /dev/null
+++ b/mayan/apps/document_parsing/admin.py
@@ -0,0 +1,23 @@
+from __future__ import unicode_literals
+
+from django.contrib import admin
+
+# Only the models declared in this app's models.py are imported and
+# registered; importing any other name from .models fails at startup.
+from .models import DocumentPageContent, DocumentVersionParseError
+
+
+@admin.register(DocumentPageContent)
+class DocumentPageContentAdmin(admin.ModelAdmin):
+    """Admin listing of the parsed content stored per document page."""
+    list_display = ('document_page',)
+
+
+@admin.register(DocumentVersionParseError)
+class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
+    """Admin view of the parse errors recorded per document version."""
+    list_display = ('document_version', 'datetime_submitted')
+    # Error entries are records of what happened, so every field is
+    # shown read-only.
+    readonly_fields = ('document_version', 'datetime_submitted', 'result')
diff --git a/mayan/apps/document_parsing/api_views.py b/mayan/apps/document_parsing/api_views.py
new file mode 100644
index 0000000000..ded56e8ed8
--- /dev/null
+++ b/mayan/apps/document_parsing/api_views.py
@@ -0,0 +1,97 @@
+from __future__ import absolute_import, unicode_literals
+
+from rest_framework import generics, status
+from rest_framework.response import Response
+
+from documents.models import Document, DocumentPage, DocumentVersion
+from rest_api.permissions import MayanPermission
+
+from .models import DocumentPageContent
+from .permissions import permission_ocr_content_view, permission_ocr_document
+from .serializers import DocumentPageContentSerializer
+
+
+class APIDocumentOCRView(generics.GenericAPIView):
+ # Endpoint that submits a document for OCR when it receives a POST.
+ # NOTE(review): permission_ocr_document is imported from .permissions, but
+ # the permissions.py of this app only defines permission_content_view --
+ # confirm which permission is intended.
+ mayan_object_permissions = {
+ 'POST': (permission_ocr_document,)
+ }
+ permission_classes = (MayanPermission,)
+ queryset = Document.objects.all()
+
+ def get_serializer_class(self):
+ # No serializer is used: post() answers with an empty 202 response.
+ return None
+
+ def post(self, request, *args, **kwargs):
+ """
+ Submit a document for OCR.
+ ---
+ omit_serializer: true
+ parameters:
+ - name: pk
+ paramType: path
+ type: number
+ responseMessages:
+ - code: 202
+ message: Accepted
+ """
+
+ # NOTE(review): submit_for_ocr() is not defined in this app; presumably
+ # it is provided by the OCR app -- confirm it is the intended call here.
+ self.get_object().submit_for_ocr()
+ return Response(status=status.HTTP_202_ACCEPTED)
+
+
+class APIDocumentVersionOCRView(generics.GenericAPIView):
+ # Endpoint that submits a single document version for OCR on POST.
+ # NOTE(review): permission_ocr_document is imported from .permissions, but
+ # the permissions.py of this app only defines permission_content_view --
+ # confirm which permission is intended.
+ mayan_object_permissions = {
+ 'POST': (permission_ocr_document,)
+ }
+ permission_classes = (MayanPermission,)
+ queryset = DocumentVersion.objects.all()
+
+ def get_serializer_class(self):
+ # No serializer is used: post() answers with an empty 202 response.
+ return None
+
+ def post(self, request, *args, **kwargs):
+ """
+ Submit a document version for OCR.
+ ---
+ omit_serializer: true
+ parameters:
+ - name: pk
+ paramType: path
+ type: number
+ responseMessages:
+ - code: 202
+ message: Accepted
+ """
+
+ # NOTE(review): submit_for_ocr() is not defined in this app; presumably
+ # it is provided by the OCR app -- confirm it is the intended call here.
+ self.get_object().submit_for_ocr()
+ return Response(status=status.HTTP_202_ACCEPTED)
+
+
+class APIDocumentPageContentView(generics.RetrieveAPIView):
+ """
+ Returns the OCR content of the selected document page.
+ ---
+ GET:
+ parameters:
+ - name: pk
+ paramType: path
+ type: number
+ """
+
+ # NOTE(review): permission_ocr_content_view is imported from .permissions,
+ # but the permissions.py of this app only defines permission_content_view
+ # -- confirm which permission is intended.
+ mayan_object_permissions = {
+ 'GET': (permission_ocr_content_view,),
+ }
+ permission_classes = (MayanPermission,)
+ serializer_class = DocumentPageContentSerializer
+ # The looked up object is a DocumentPage; its content is serialized below.
+ queryset = DocumentPage.objects.all()
+
+ def retrieve(self, request, *args, **kwargs):
+ instance = self.get_object()
+
+ try:
+ # NOTE(review): this app's DocumentPageContent declares
+ # related_name='content'; `ocr_content` presumably belongs to the OCR
+ # app's model -- confirm which relation should be read here.
+ ocr_content = instance.ocr_content
+ except DocumentPageContent.DoesNotExist:
+ # The page has no stored content: serialize an empty queryset instead.
+ ocr_content = DocumentPageContent.objects.none()
+
+ serializer = self.get_serializer(ocr_content)
+ return Response(serializer.data)
diff --git a/mayan/apps/document_parsing/apps.py b/mayan/apps/document_parsing/apps.py
new file mode 100644
index 0000000000..6b9a68d499
--- /dev/null
+++ b/mayan/apps/document_parsing/apps.py
@@ -0,0 +1,125 @@
+from __future__ import unicode_literals
+
+import logging
+
+from kombu import Exchange, Queue
+
+from django.apps import apps
+from django.db.models.signals import post_save
+from django.utils.translation import ugettext_lazy as _
+
+from acls import ModelPermission
+from common import (
+ MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
+ menu_tools
+)
+from common.settings import settings_db_sync_task_delay
+from documents.search import document_search, document_page_search
+from documents.signals import post_version_upload
+from documents.widgets import document_link
+from mayan.celery import app
+from navigation import SourceColumn
+from rest_api.classes import APIEndPoint
+
+from .handlers import handler_parse_document_version
+from .links import (
+ link_document_content, link_entry_list, link_document_content_errors_list,
+ link_document_content_download
+)
+from .permissions import permission_content_view
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentParsingApp(MayanAppConfig):
+    """
+    Configuration of the document parsing app.
+
+    When the app is ready it registers the API endpoint, the content view
+    permission for documents, the columns of the parse error list, the
+    content search fields, the menu links and the signal handler that parses
+    newly uploaded document versions.
+    """
+    has_tests = True
+    name = 'document_parsing'
+    verbose_name = _('Document parsing')
+
+    def ready(self):
+        super(DocumentParsingApp, self).ready()
+
+        APIEndPoint(app=self, version_string='1')
+
+        # Models are fetched through the app registry instead of being
+        # imported at module level.
+        Document = apps.get_model(
+            app_label='documents', model_name='Document'
+        )
+
+        DocumentVersion = apps.get_model(
+            app_label='documents', model_name='DocumentVersion'
+        )
+
+        DocumentVersionParseError = self.get_model('DocumentVersionParseError')
+
+        ModelPermission.register(
+            model=Document, permissions=(permission_content_view,)
+        )
+
+        # Columns displayed when listing parse errors.
+        SourceColumn(
+            source=DocumentVersionParseError, label=_('Document'),
+            func=lambda context: document_link(
+                context['object'].document_version.document
+            )
+        )
+        SourceColumn(
+            source=DocumentVersionParseError, label=_('Added'),
+            attribute='datetime_submitted'
+        )
+        SourceColumn(
+            source=DocumentVersionParseError, label=_('Result'),
+            attribute='result'
+        )
+
+        # `content` is the related name DocumentPageContent declares on
+        # DocumentPage, hence the `...__content__content` lookups.
+        document_search.add_model_field(
+            field='versions__pages__content__content', label=_('Content')
+        )
+
+        document_page_search.add_model_field(
+            field='content__content', label=_('Content')
+        )
+
+        # Only the links imported from .links are bound; binding a name
+        # that is not imported raises NameError during startup.
+        menu_facet.bind_links(
+            links=(link_document_content,), sources=(Document,)
+        )
+        # The source view names mirror the `view` of each link in links.py.
+        menu_secondary.bind_links(
+            links=(
+                link_document_content, link_document_content_errors_list,
+                link_document_content_download
+            ),
+            sources=(
+                'document_parsing:document_content',
+                'document_parsing:document_page_parsing_error_list',
+                'document_parsing:document_content_download',
+            )
+        )
+        menu_secondary.bind_links(
+            links=(link_entry_list,),
+            sources=(
+                'document_parsing:entry_list',
+                'document_parsing:entry_delete_multiple',
+                'document_parsing:entry_re_queue_multiple',
+                DocumentVersionParseError
+            )
+        )
+        # The trailing comma matters: `(link_entry_list)` is the bare link,
+        # not a one element tuple.
+        menu_tools.bind_links(
+            links=(link_entry_list,)
+        )
+
+        post_version_upload.connect(
+            dispatch_uid='document_parsing_handler_parse_document_version',
+            receiver=handler_parse_document_version,
+            sender=DocumentVersion
+        )
diff --git a/mayan/apps/document_parsing/exceptions.py b/mayan/apps/document_parsing/exceptions.py
new file mode 100644
index 0000000000..9fc7a9b90a
--- /dev/null
+++ b/mayan/apps/document_parsing/exceptions.py
@@ -0,0 +1,22 @@
+from __future__ import unicode_literals
+
+
+class OCRError(Exception):
+    """Error raised by the OCR backend."""
+
+
+class ParserError(Exception):
+    """Root of the exceptions raised by the file parsers."""
+
+
+class NoMIMETypeMatch(ParserError):
+    """No parser is registered for the requested MIME type."""
diff --git a/mayan/apps/document_parsing/forms.py b/mayan/apps/document_parsing/forms.py
new file mode 100644
index 0000000000..0881a9185a
--- /dev/null
+++ b/mayan/apps/document_parsing/forms.py
@@ -0,0 +1,104 @@
+from __future__ import unicode_literals
+
+from django import forms
+from django.utils.encoding import force_text
+from django.utils.html import conditional_escape
+from django.utils.safestring import mark_safe
+from django.utils.translation import ugettext_lazy as _, ugettext
+
+from common.widgets import TextAreaDiv
+from documents.models import DocumentType
+
+from .models import DocumentPageContent, DocumentPageOCRContent
+
+
+class DocumentContentForm(forms.Form):
+    """
+    Form that concatenates the parsed text content of all of a document's
+    pages into a single textarea widget.
+
+    The document is passed as the ``instance`` keyword argument. Pages
+    without stored content are skipped.
+    """
+    def __init__(self, *args, **kwargs):
+        self.document = kwargs.pop('instance', None)
+        super(DocumentContentForm, self).__init__(*args, **kwargs)
+        content = []
+
+        try:
+            document_pages = self.document.pages.all()
+        except AttributeError:
+            # No document was supplied, or it has no `pages` attribute.
+            document_pages = []
+
+        for page in document_pages:
+            try:
+                # `content` is the related name this app's
+                # DocumentPageContent model declares on DocumentPage.
+                page_content = page.content.content
+            except DocumentPageContent.DoesNotExist:
+                # Nothing parsed for this page: emit neither text nor
+                # divider.
+                continue
+
+            # The text is escaped because the joined result is marked safe.
+            content.append(conditional_escape(force_text(page_content)))
+            # NOTE(review): the divider markup was unreadable in the patch
+            # as received (the literal was split over three lines); it is
+            # reconstructed here as an <hr/> wrapped div -- confirm.
+            content.append(
+                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
+                    ugettext(
+                        'Page %(page_number)d'
+                    ) % {'page_number': page.page_number}
+                )
+            )
+
+        self.fields['contents'].initial = mark_safe(''.join(content))
+
+    contents = forms.CharField(
+        label=_('Contents'),
+        widget=TextAreaDiv(
+            attrs={
+                'class': 'text_area_div full-height',
+                'data-height-difference': 360
+            }
+        )
+    )
+
+
+class DocumentOCRContentForm(forms.Form):
+    """
+    Form that concatenates the OCR text content of all of a document's
+    pages into a single textarea widget.
+
+    The document is passed as the ``instance`` keyword argument. Pages
+    without stored content are skipped.
+    """
+    def __init__(self, *args, **kwargs):
+        self.document = kwargs.pop('instance', None)
+        # super() must name this class; naming DocumentContentForm raises
+        # TypeError because `self` is not an instance of that form.
+        super(DocumentOCRContentForm, self).__init__(*args, **kwargs)
+        content = []
+
+        try:
+            document_pages = self.document.pages.all()
+        except AttributeError:
+            # No document was supplied, or it has no `pages` attribute.
+            document_pages = []
+
+        for page in document_pages:
+            try:
+                page_content = page.ocr_content.content
+            # NOTE(review): DocumentPageOCRContent is imported from .models
+            # but this app's models.py does not define it -- confirm where
+            # the OCR content model lives.
+            except DocumentPageOCRContent.DoesNotExist:
+                # No OCR content for this page: emit neither text nor
+                # divider.
+                continue
+
+            # The text is escaped because the joined result is marked safe.
+            content.append(conditional_escape(force_text(page_content)))
+            # NOTE(review): the divider markup was unreadable in the patch
+            # as received (the literal was split over three lines); it is
+            # reconstructed here as an <hr/> wrapped div -- confirm.
+            content.append(
+                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
+                    ugettext(
+                        'Page %(page_number)d'
+                    ) % {'page_number': page.page_number}
+                )
+            )
+
+        self.fields['contents'].initial = mark_safe(''.join(content))
+
+    contents = forms.CharField(
+        label=_('Contents'),
+        widget=TextAreaDiv(
+            attrs={
+                'class': 'text_area_div full-height',
+                'data-height-difference': 360
+            }
+        )
+    )
+
+
+class DocumentTypeSelectForm(forms.Form):
+    """Form offering a choice among all the existing document types."""
+    document_type = forms.ModelChoiceField(
+        # The label goes through `_` so it is translatable like the other
+        # labels of this module; bare parentheses are just a plain string.
+        queryset=DocumentType.objects.all(), label=_('Document type')
+    )
diff --git a/mayan/apps/document_parsing/handlers.py b/mayan/apps/document_parsing/handlers.py
new file mode 100644
index 0000000000..618826246c
--- /dev/null
+++ b/mayan/apps/document_parsing/handlers.py
@@ -0,0 +1,15 @@
+from __future__ import unicode_literals
+
+import logging
+
+from django.apps import apps
+
+from .settings import setting_auto_ocr
+from .parsers import Parser
+
+logger = logging.getLogger(__name__)
+
+
+def handler_parse_document_version(sender, instance, **kwargs):
+    """
+    Signal receiver that parses the document version given as `instance`,
+    but only when the signal's `created` keyword argument is true.
+    """
+    was_created = kwargs['created']
+    if not was_created:
+        return
+
+    Parser.parse_document_version(document_version=instance)
diff --git a/mayan/apps/document_parsing/links.py b/mayan/apps/document_parsing/links.py
new file mode 100644
index 0000000000..cce30bcad5
--- /dev/null
+++ b/mayan/apps/document_parsing/links.py
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from navigation import Link
+
+from .permissions import permission_content_view
+
+# NOTE(review): permission_content_view is the only permission this app
+# defines and imports here, so every link uses it; referencing any other
+# permission name raises NameError on import. Confirm whether the parsing
+# error links need a dedicated permission.
+
+# Facet link to the parsed content of a document.
+link_document_content = Link(
+    args='resolved_object.id', icon='fa fa-font',
+    permissions=(permission_content_view,), text=_('Content'),
+    view='document_parsing:document_content',
+)
+# Link to the list of parsing error entries (takes no arguments).
+link_entry_list = Link(
+    icon='fa fa-file-text-o', permissions=(permission_content_view,),
+    text=_('Parsing errors'), view='document_parsing:entry_list'
+)
+# Link to the parsing errors of a single resolved object.
+link_document_content_errors_list = Link(
+    args='resolved_object.id', icon='fa fa-file-text-o',
+    permissions=(permission_content_view,), text=_('Parsing errors'),
+    view='document_parsing:document_page_parsing_error_list'
+)
+# Link to download the parsed content of a single resolved object.
+link_document_content_download = Link(
+    args='resolved_object.id', icon='fa fa-file-text-o',
+    permissions=(permission_content_view,), text=_('Download content'),
+    view='document_parsing:document_content_download'
+)
diff --git a/mayan/apps/document_parsing/managers.py b/mayan/apps/document_parsing/managers.py
new file mode 100644
index 0000000000..2e17131486
--- /dev/null
+++ b/mayan/apps/document_parsing/managers.py
@@ -0,0 +1,14 @@
+from __future__ import unicode_literals
+
+from datetime import timedelta
+import logging
+
+from django.apps import apps
+from django.db import models
+from django.utils.timezone import now
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentPageContentManager(models.Manager):
+    """Manager of DocumentPageContent; adds nothing to models.Manager yet."""
diff --git a/mayan/apps/document_parsing/models.py b/mayan/apps/document_parsing/models.py
new file mode 100644
index 0000000000..38dc9ff7f1
--- /dev/null
+++ b/mayan/apps/document_parsing/models.py
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+from django.db import models
+from django.utils.encoding import force_text, python_2_unicode_compatible
+from django.utils.translation import ugettext_lazy as _
+
+from documents.models import DocumentPage, DocumentType, DocumentVersion
+
+from .managers import DocumentPageContentManager
+
+
+@python_2_unicode_compatible
+class DocumentPageContent(models.Model):
+    """
+    Text content of a single document page. Each page has at most one
+    content row, reachable from the page as `content`.
+    """
+    document_page = models.OneToOneField(
+        DocumentPage, on_delete=models.CASCADE, related_name='content',
+        verbose_name=_('Document page')
+    )
+    content = models.TextField(blank=True, verbose_name=_('Content'))
+
+    objects = DocumentPageContentManager()
+
+    class Meta:
+        verbose_name = _('Document page content')
+        verbose_name_plural = _('Document pages contents')
+
+    def __str__(self):
+        # Represented by the text form of the page it belongs to.
+        page = self.document_page
+        return force_text(page)
+
+
+@python_2_unicode_compatible
+class DocumentVersionParseError(models.Model):
+    """
+    Record of an error produced while parsing a document version. Entries
+    are ordered by the time they were submitted.
+    """
+    document_version = models.ForeignKey(
+        DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
+        verbose_name=_('Document version')
+    )
+    # `auto_now_add` is the DateTimeField option that stamps the row on
+    # creation; `auto_add_now` is not a field option and raises TypeError.
+    datetime_submitted = models.DateTimeField(
+        auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
+    )
+    result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
+
+    def __str__(self):
+        return force_text(self.document_version)
+
+    class Meta:
+        ordering = ('datetime_submitted',)
+        verbose_name = _('Document version parse error')
+        verbose_name_plural = _('Document version parse errors')
diff --git a/mayan/apps/document_parsing/parsers.py b/mayan/apps/document_parsing/parsers.py
new file mode 100644
index 0000000000..87570afa1f
--- /dev/null
+++ b/mayan/apps/document_parsing/parsers.py
@@ -0,0 +1,202 @@
+from __future__ import unicode_literals
+
+from io import BytesIO
+import logging
+import os
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfpage import PDFPage
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+import subprocess
+
+from django.utils.translation import ugettext_lazy as _
+
+from common.utils import copyfile, fs_cleanup, mkstemp
+
+from .exceptions import ParserError, NoMIMETypeMatch
+from .models import DocumentPageContent
+from .settings import setting_pdftotext_path
+
+logger = logging.getLogger(__name__)
+
+
+class Parser(object):
+    """
+    Parser base class
+
+    Keeps a registry that maps a MIME type to the list of parser classes
+    able to handle it. Subclasses implement execute() to extract the text
+    of one page.
+    """
+
+    # MIME type -> list of parser classes, in registration order.
+    _registry = {}
+
+    @classmethod
+    def register(cls, mimetypes, parser_classes):
+        """
+        Append every class in `parser_classes` to the registry entry of
+        every MIME type in `mimetypes`.
+        """
+        for mimetype in mimetypes:
+            for parser_class in parser_classes:
+                cls._registry.setdefault(
+                    mimetype, []
+                ).append(parser_class)
+
+    @classmethod
+    def _get_parser_classes(cls, mimetype):
+        """
+        Return the parser classes registered for `mimetype` or raise
+        NoMIMETypeMatch when there are none.
+        """
+        # Only the registry lookup is guarded, so a KeyError raised while a
+        # parser runs is not misreported as a missing MIME type.
+        try:
+            return cls._registry[mimetype]
+        except KeyError:
+            raise NoMIMETypeMatch
+
+    @classmethod
+    def parse_document_version(cls, document_version):
+        """
+        Parse `document_version` with the first registered parser that
+        succeeds. Raises NoMIMETypeMatch when no parser is registered for
+        its MIME type or when every registered parser fails.
+        """
+        for parser_class in cls._get_parser_classes(document_version.mimetype):
+            try:
+                parser = parser_class()
+                parser.process_document_version(document_version)
+            except ParserError:
+                # If parser raises error, try next parser in the list
+                pass
+            else:
+                # If parser was successful there is no need to try
+                # others in the list for this mimetype
+                return
+
+        raise NoMIMETypeMatch('Parser MIME type list exhausted')
+
+    @classmethod
+    def parse_document_page(cls, document_page):
+        """
+        Parse a single `document_page` with the first registered parser
+        that succeeds. Raises NoMIMETypeMatch when no parser is registered
+        for the MIME type of its document version or when every registered
+        parser fails.
+        """
+        mimetype = document_page.document_version.mimetype
+
+        for parser_class in cls._get_parser_classes(mimetype):
+            try:
+                parser = parser_class()
+                parser.process_document_page(document_page)
+            except ParserError:
+                # If parser raises error, try next parser in the list
+                pass
+            else:
+                # If parser was successful there is no need to try
+                # others in the list for this mimetype
+                return
+
+        raise NoMIMETypeMatch('Parser MIME type list exhausted')
+
+    def process_document_version(self, document_version):
+        """
+        Process every page of `document_version` with this parser.
+        """
+        logger.info(
+            'Starting parsing for document version: %s', document_version
+        )
+        logger.debug('document version: %d', document_version.pk)
+
+        for document_page in document_version.pages.all():
+            self.process_document_page(document_page=document_page)
+
+    def process_document_page(self, document_page):
+        """
+        Run execute() for `document_page` and store the returned text in
+        its DocumentPageContent row, creating the row when needed. Any
+        exception raised while doing so is logged and re-raised as
+        ParserError.
+        """
+        logger.info(
+            'Processing page: %d of document version: %s',
+            document_page.page_number, document_page.document_version
+        )
+
+        file_object = document_page.document_version.get_intermidiate_file()
+
+        try:
+            document_page_content, created = DocumentPageContent.objects.get_or_create(
+                document_page=document_page
+            )
+            document_page_content.content = self.execute(
+                file_object=file_object, page_number=document_page.page_number
+            )
+            document_page_content.save()
+        except Exception as exception:
+            # Everything is funnelled into ParserError so the parse_*
+            # class methods can fall through to the next parser.
+            error_message = _('Exception parsing page; %s') % exception
+            logger.error(error_message)
+            raise ParserError(error_message)
+        finally:
+            file_object.close()
+
+        logger.info(
+            'Finished processing page: %d of document version: %s',
+            document_page.page_number, document_page.document_version
+        )
+
+    def execute(self, file_object, page_number):
+        """
+        Return the text of page `page_number` of `file_object`. Must be
+        implemented by subclasses.
+        """
+        raise NotImplementedError(
+            'Your %s class has not defined the required execute() method.' %
+            self.__class__.__name__
+        )
+
+
+class PopplerParser(Parser):
+    """
+    PDF parser using the pdftotext executable from the poppler package
+    """
+
+    def __init__(self):
+        """
+        Read the configured pdftotext path. Raises ParserError when that
+        path does not exist.
+        """
+        self.pdftotext_path = setting_pdftotext_path.value
+        if not os.path.exists(self.pdftotext_path):
+            error_message = _(
+                'Cannot find pdftotext executable at: %s'
+            ) % self.pdftotext_path
+            logger.error(error_message)
+            raise ParserError(error_message)
+
+        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)
+
+    def execute(self, file_object, page_number):
+        """
+        Copy `file_object` to a temporary file, run pdftotext on page
+        `page_number` only and return what it wrote to standard output.
+        Raises ParserError when pdftotext exits with a non zero code.
+        """
+        logger.debug('Parsing PDF page: %d', page_number)
+
+        destination_descriptor, temp_filepath = mkstemp()
+
+        try:
+            copyfile(file_object, temp_filepath)
+
+            # -f and -l select the first and last page; the trailing '-'
+            # sends the text to standard output.
+            command = [
+                self.pdftotext_path, '-f', str(page_number), '-l',
+                str(page_number), temp_filepath, '-'
+            ]
+
+            proc = subprocess.Popen(
+                command, close_fds=True, stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE
+            )
+            # communicate() drains both pipes while waiting. wait() with
+            # PIPEs can deadlock once the child fills a pipe buffer.
+            output, error_output = proc.communicate()
+        finally:
+            # Remove the temporary copy whether or not the run succeeded.
+            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)
+
+        if proc.returncode != 0:
+            logger.error(error_output)
+            raise ParserError
+
+        # A lone form feed means the page produced no text.
+        if output == b'\x0c':
+            logger.debug('Parser didn\'t return any output')
+            return ''
+
+        # Strip the trailing two line feeds plus form feed.
+        if output[-3:] == b'\x0a\x0a\x0c':
+            return output[:-3]
+
+        return output
+
+
+class PDFMinerParser(Parser):
+    """
+    Parser for PDF files using the PDFMiner library for Python
+    """
+
+    def execute(self, file_object, page_number):
+        """
+        Return the text PDFMiner extracts from page `page_number` of
+        `file_object`, as the bytes written to an in-memory buffer.
+        """
+        logger.debug('Parsing PDF page: %d', page_number)
+
+        with BytesIO() as string_buffer:
+            rsrcmgr = PDFResourceManager()
+            device = TextConverter(
+                rsrcmgr, outfp=string_buffer, laparams=LAParams()
+            )
+            try:
+                interpreter = PDFPageInterpreter(rsrcmgr, device)
+                # The page index handed to PDFMiner is page_number minus one.
+                pages = PDFPage.get_pages(
+                    file_object, maxpages=1, pagenos=(page_number - 1,)
+                )
+                # The next() builtin works on Python 2 and 3; the .next()
+                # method exists on Python 2 only.
+                interpreter.process_page(next(pages))
+            finally:
+                # Close the converter even when processing the page fails.
+                device.close()
+
+            logger.debug('Finished parsing PDF: %d', page_number)
+
+            # Must be read before the buffer is closed by the with block.
+            return string_buffer.getvalue()
+
+
+# Register the PDF parsers. The parse_* class methods try them in this
+# order, so PDFMinerParser only runs when PopplerParser raises ParserError.
+Parser.register(
+ mimetypes=('application/pdf',),
+ parser_classes=(PopplerParser, PDFMinerParser)
+)
diff --git a/mayan/apps/document_parsing/permissions.py b/mayan/apps/document_parsing/permissions.py
new file mode 100644
index 0000000000..fd003b8f35
--- /dev/null
+++ b/mayan/apps/document_parsing/permissions.py
@@ -0,0 +1,11 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from permissions import PermissionNamespace
+
+# Permission namespace of the document parsing app.
+namespace = PermissionNamespace('document_parsing', _('Document parsing'))
+
+# Registered for the Document model in apps.py and required by the document
+# content link in links.py.
+permission_content_view = namespace.add_permission(
+ name='content_view', label=_('View the content of a document')
+)
diff --git a/mayan/apps/document_parsing/queues.py b/mayan/apps/document_parsing/queues.py
new file mode 100644
index 0000000000..92297a2524
--- /dev/null
+++ b/mayan/apps/document_parsing/queues.py
@@ -0,0 +1,10 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from task_manager.classes import CeleryQueue
+
+# NOTE(review): the queue name, its label and the task path below all refer
+# to the OCR app ('ocr.tasks.task_do_ocr'); no parsing task is visible in
+# this patch. Confirm whether this module should declare a parsing queue.
+queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
+queue_ocr.add_task_type(
+ name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
+)
diff --git a/mayan/apps/document_parsing/serializers.py b/mayan/apps/document_parsing/serializers.py
new file mode 100644
index 0000000000..7161d2fc40
--- /dev/null
+++ b/mayan/apps/document_parsing/serializers.py
@@ -0,0 +1,11 @@
+from __future__ import unicode_literals
+
+from rest_framework import serializers
+
+from .models import DocumentPageContent
+
+
+class DocumentPageContentSerializer(serializers.ModelSerializer):
+    """Serializer exposing only the `content` field of DocumentPageContent."""
+
+    class Meta:
+        model = DocumentPageContent
+        fields = ('content',)
diff --git a/mayan/apps/document_parsing/settings.py b/mayan/apps/document_parsing/settings.py
new file mode 100644
index 0000000000..47caeb44a8
--- /dev/null
+++ b/mayan/apps/document_parsing/settings.py
@@ -0,0 +1,17 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from smart_settings import Namespace
+
+# Settings namespace of the document parsing app.
+namespace = Namespace(name='document_parsing', label=_('Document parsing'))
+
+# Path of the pdftotext binary. PopplerParser reads this value and raises
+# ParserError when the path does not exist.
+setting_pdftotext_path = namespace.add_setting(
+ global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH',
+ default='/usr/bin/pdftotext',
+ help_text=_(
+ 'File path to poppler\'s pdftotext program used to extract text '
+ 'from PDF files.'
+ ),
+ is_path=True
+)
diff --git a/mayan/apps/document_parsing/tests/__init__.py b/mayan/apps/document_parsing/tests/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/mayan/apps/document_parsing/tests/test_api.py b/mayan/apps/document_parsing/tests/test_api.py
new file mode 100644
index 0000000000..fb73bef98d
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_api.py
@@ -0,0 +1,88 @@
+from __future__ import unicode_literals
+
+import json
+
+from django.contrib.auth import get_user_model
+from django.urls import reverse
+
+from rest_framework import status
+
+from documents.models import DocumentType
+from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
+from rest_api.tests import BaseAPITestCase
+from user_management.tests import (
+ TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
+)
+
+
+class OCRAPITestCase(BaseAPITestCase):
+ """
+ Test the OCR app API endpoints
+ """
+
+ # NOTE(review): these tests target OCR URL names and the `ocr_content`
+ # relation, while this app's DocumentPageContent declares
+ # related_name='content' -- confirm they were adapted for parsing.
+
+ def setUp(self):
+ super(OCRAPITestCase, self).setUp()
+
+ # Every request is made as a logged in superuser.
+ self.admin_user = get_user_model().objects.create_superuser(
+ username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
+ password=TEST_ADMIN_PASSWORD
+ )
+
+ self.client.login(
+ username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
+ )
+
+ self.document_type = DocumentType.objects.create(
+ label=TEST_DOCUMENT_TYPE_LABEL
+ )
+
+ # NOTE(review): the file is opened in text mode; presumably it should be
+ # binary ('rb') for a document file -- confirm.
+ with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
+ self.document = self.document_type.new_document(
+ file_object=file_object,
+ )
+
+ def tearDown(self):
+ self.document_type.delete()
+ super(OCRAPITestCase, self).tearDown()
+
+ def test_submit_document(self):
+ # POSTing to the document submit endpoint must answer 202 and leave
+ # the expected text in the first page's content.
+ response = self.client.post(
+ reverse(
+ 'rest_api:document-ocr-submit-view',
+ args=(self.document.pk,)
+ )
+ )
+
+ self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+ content = self.document.pages.first().ocr_content.content
+
+ self.assertTrue('Mayan EDMS Documentation' in content)
+
+ def test_submit_document_version(self):
+ # Same check as above, submitting the latest document version instead.
+ response = self.client.post(
+ reverse(
+ 'rest_api:document-version-ocr-submit-view',
+ args=(self.document.latest_version.pk,)
+ )
+ )
+
+ self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+ content = self.document.pages.first().ocr_content.content
+
+ self.assertTrue('Mayan EDMS Documentation' in content)
+
+ def test_get_document_version_page_content(self):
+ # The page content endpoint must answer 200 with JSON whose `content`
+ # key holds the expected text.
+ response = self.client.get(
+ reverse(
+ 'rest_api:document-page-content-view',
+ args=(self.document.latest_version.pages.first().pk,)
+ ),
+ )
+
+ self.assertEqual(response.status_code, status.HTTP_200_OK)
+
+ self.assertTrue(
+ 'Mayan EDMS Documentation' in json.loads(response.content)['content']
+ )
diff --git a/mayan/apps/document_parsing/tests/test_events.py b/mayan/apps/document_parsing/tests/test_events.py
new file mode 100644
index 0000000000..dc366623d2
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_events.py
@@ -0,0 +1,41 @@
+from __future__ import unicode_literals
+
+from actstream.models import Action
+
+from documents.tests.test_models import GenericDocumentTestCase
+
+from ..events import (
+ event_ocr_document_version_submit, event_ocr_document_version_finish
+)
+
+
+class OCREventsTestCase(GenericDocumentTestCase):
+    """
+    Check the events recorded when a document is submitted for OCR.
+    """
+    # NOTE(review): the events are imported from ..events and
+    # submit_for_ocr() is an OCR call; neither is defined in the part of
+    # this app visible here -- confirm these tests belong to this app.
+
+    def test_document_version_submit_event(self):
+        # Start from an empty action log so first() is the submit event.
+        Action.objects.all().delete()
+        self.document.submit_for_ocr()
+
+        self.assertEqual(
+            Action.objects.first().target, self.document.latest_version
+        )
+        self.assertEqual(
+            Action.objects.first().verb,
+            event_ocr_document_version_submit.name
+        )
+
+    def test_document_version_finish_event(self):
+        # Start from an empty action log so last() is the finish event.
+        Action.objects.all().delete()
+        self.document.submit_for_ocr()
+
+        self.assertEqual(
+            Action.objects.last().target, self.document.latest_version
+        )
+        self.assertEqual(
+            Action.objects.last().verb,
+            event_ocr_document_version_finish.name
+        )
diff --git a/mayan/apps/document_parsing/tests/test_models.py b/mayan/apps/document_parsing/tests/test_models.py
new file mode 100644
index 0000000000..36dbb57f67
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_models.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.settings import setting_language_choices
+from documents.tests import (
+ TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
+)
+
+
+class DocumentOCRTestCase(BaseTestCase):
+ # Uploads a small test document and checks the text stored for its first
+ # page.
+ # NOTE(review): the assertion reads `ocr_content`, while this app's
+ # DocumentPageContent declares related_name='content' -- confirm the
+ # test was adapted for parsing.
+
+ # PyOCR's leak descriptor in get_available_languages and image_to_string
+ # Disable descriptor leak test until fixed in upstream
+ _skip_file_descriptor_test = True
+
+ def setUp(self):
+ super(DocumentOCRTestCase, self).setUp()
+
+ self.document_type = DocumentType.objects.create(
+ label=TEST_DOCUMENT_TYPE_LABEL
+ )
+
+ # NOTE(review): the file is opened in text mode; presumably it should be
+ # binary ('rb') for a document file -- confirm.
+ with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
+ self.document = self.document_type.new_document(
+ file_object=file_object,
+ )
+
+ def tearDown(self):
+ self.document.delete()
+ self.document_type.delete()
+ super(DocumentOCRTestCase, self).tearDown()
+
+ def test_ocr_language_backends_end(self):
+ content = self.document.pages.first().ocr_content.content
+ self.assertTrue('Mayan EDMS Documentation' in content)
+
+
+class GermanOCRSupportTestCase(BaseTestCase):
+ # Uploads a German test document and checks the text stored for its
+ # first page.
+ # NOTE(review): the assertions read `ocr_content`, while this app's
+ # DocumentPageContent declares related_name='content' -- confirm the
+ # test was adapted for parsing.
+
+ # PyOCR's leak descriptor in get_available_languages and image_to_string
+ # Disable descriptor leak test until fixed in upstream
+ _skip_file_descriptor_test = True
+
+ def setUp(self):
+ super(GermanOCRSupportTestCase, self).setUp()
+
+ self.document_type = DocumentType.objects.create(
+ label=TEST_DOCUMENT_TYPE_LABEL
+ )
+
+ # Get corresponding language code for German from the default language
+ # choices list
+ language_code = [
+ language for language in setting_language_choices.value if language[1] == 'German'
+ ][0][0]
+
+ self.assertEqual('deu', language_code)
+
+ # NOTE(review): the file is opened in text mode; presumably it should be
+ # binary ('rb') for a document file -- confirm.
+ with open(TEST_DEU_DOCUMENT_PATH) as file_object:
+ self.document = self.document_type.new_document(
+ file_object=file_object, language=language_code
+ )
+
+ def tearDown(self):
+ self.document_type.delete()
+ super(GermanOCRSupportTestCase, self).tearDown()
+
+ def test_ocr_language_backends_end(self):
+ content = self.document.pages.first().ocr_content.content
+
+ self.assertTrue(
+ 'Repository für elektronische Dokumente.' in content
+ )
+ self.assertTrue(
+ 'Es bietet einen' in content
+ )
diff --git a/mayan/apps/document_parsing/tests/test_parsers.py b/mayan/apps/document_parsing/tests/test_parsers.py
new file mode 100644
index 0000000000..9d500a572a
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_parsers.py
@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+from django.core.files.base import File
+from django.test import override_settings
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.tests import (
+ TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
+)
+
+from ..classes import TextExtractor
+from ..parsers import PDFMinerParser, PopplerParser
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class ParserTestCase(BaseTestCase):
+    """
+    Run each PDF parser against the test document and check the text it
+    stores for the first page.
+    """
+    def setUp(self):
+        super(ParserTestCase, self).setUp()
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # The fixture is handed to PDF parsers, so it is read as bytes.
+        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(ParserTestCase, self).tearDown()
+
+    def _first_page_content(self):
+        # Parsers store their output in DocumentPageContent, which the
+        # page exposes through the `content` related name.
+        return self.document.pages.first().content.content
+
+    def test_pdfminer_parser(self):
+        parser = PDFMinerParser()
+
+        parser.process_document_version(self.document.latest_version)
+
+        self.assertTrue(
+            'Mayan EDMS Documentation' in self._first_page_content()
+        )
+
+    def test_poppler_parser(self):
+        parser = PopplerParser()
+
+        parser.process_document_version(self.document.latest_version)
+
+        self.assertTrue(
+            'Mayan EDMS Documentation' in self._first_page_content()
+        )
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class TextExtractorTestCase(BaseTestCase):
+ # Runs TextExtractor over a hybrid test document and checks the text
+ # stored for its first and last pages.
+ # NOTE(review): TextExtractor is imported from ..classes, a module not
+ # visible in this patch, and the assertions read `ocr_content` rather
+ # than this app's `content` relation -- confirm the test belongs here.
+ def setUp(self):
+ super(TextExtractorTestCase, self).setUp()
+
+ self.document_type = DocumentType.objects.create(
+ label=TEST_DOCUMENT_TYPE_LABEL
+ )
+
+ # NOTE(review): the file is opened in text mode; presumably it should be
+ # binary ('rb') for a document file -- confirm.
+ with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
+ self.document = self.document_type.new_document(
+ file_object=File(file_object)
+ )
+
+ def tearDown(self):
+ self.document_type.delete()
+ super(TextExtractorTestCase, self).tearDown()
+
+ def test_text_extractor(self):
+ TextExtractor.process_document_version(
+ document_version=self.document.latest_version
+ )
+
+ self.assertEqual(
+ self.document.latest_version.pages.first().ocr_content.content,
+ 'Sample text',
+ )
+
+ self.assertEqual(
+ self.document.latest_version.pages.last().ocr_content.content,
+ 'Sample text in image form',
+ )
diff --git a/mayan/apps/document_parsing/tests/test_views.py b/mayan/apps/document_parsing/tests/test_views.py
new file mode 100644
index 0000000000..41b0462103
--- /dev/null
+++ b/mayan/apps/document_parsing/tests/test_views.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from django.test import override_settings
+
+from documents.tests.test_views import GenericDocumentViewTestCase
+
+from ..permissions import permission_ocr_content_view
+from ..utils import get_document_ocr_content
+
+
+@override_settings(OCR_AUTO_OCR=True)
+class OCRViewsTestCase(GenericDocumentViewTestCase):
+    """Exercise the OCR content and OCR download views of a document."""
+
+    # PyOCR leaks file descriptors in get_available_languages and
+    # image_to_string; skip the descriptor leak test until fixed upstream.
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(OCRViewsTestCase, self).setUp()
+        self.login_user()
+
+    def _document_content_view(self):
+        view_arguments = (self.document.pk,)
+        return self.get('ocr:document_content', args=view_arguments)
+
+    def test_document_content_view_no_permissions(self):
+        status_code = self._document_content_view().status_code
+
+        self.assertEqual(status_code, 403)
+
+    def test_document_content_view_with_permission(self):
+        self.grant_permission(permission=permission_ocr_content_view)
+
+        content_response = self._document_content_view()
+
+        expected_text = 'Mayan EDMS Documentation'
+        self.assertContains(content_response, expected_text, status_code=200)
+
+    def test_document_ocr_download_view_no_permission(self):
+        view_arguments = (self.document.pk,)
+        response = self.get('ocr:document_ocr_download', args=view_arguments)
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_download_view_with_permission(self):
+        # Presumably read by assert_download_response; confirm in base class.
+        self.expected_content_type = 'application/octet-stream; charset=utf-8'
+
+        self.grant_permission(permission=permission_ocr_content_view)
+
+        view_arguments = (self.document.pk,)
+        response = self.get('ocr:document_ocr_download', args=view_arguments)
+
+        self.assertEqual(response.status_code, 200)
+
+        # The download must match the joined per page OCR content.
+        expected_content = ''.join(
+            get_document_ocr_content(document=self.document)
+        )
+        self.assert_download_response(response, content=expected_content)
diff --git a/mayan/apps/document_parsing/urls.py b/mayan/apps/document_parsing/urls.py
new file mode 100644
index 0000000000..30f1b59359
--- /dev/null
+++ b/mayan/apps/document_parsing/urls.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from django.conf.urls import url
+
+from .api_views import (
+ APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
+)
+from .views import (
+ DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
+ DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
+ DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
+)
+
+urlpatterns = [  # `pk` is the kwarg the views read via self.kwargs['pk'].
+    url(
+        r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(),
+        name='document_content'
+    ),
+    url(
+        r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
+        name='document_submit'
+    ),
+    url(
+        r'^document/all/submit/$', DocumentAllSubmitView.as_view(),
+        name='document_submit_all'
+    ),
+    url(
+        r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
+        name='document_type_submit'
+    ),
+    url(
+        r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(),
+        name='document_submit_multiple'
+    ),
+    url(
+        r'^document_type/(?P<pk>\d+)/ocr/settings/$',
+        DocumentTypeSettingsEditView.as_view(),
+        name='document_type_ocr_settings'
+    ),
+    url(
+        r'^documents/(?P<pk>\d+)/ocr/errors/$',
+        DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
+    ),
+    url(
+        r'^documents/(?P<pk>\d+)/ocr/download/$',
+        DocumentOCRDownloadView.as_view(), name='document_ocr_download'
+    ),
+    url(r'^all/$', EntryListView.as_view(), name='entry_list'),
+]
+
+api_urls = [  # NOTE(review): `pk` assumes the DRF default lookup kwarg.
+    url(
+        r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
+        name='document-ocr-submit-view'
+    ),
+    url(
+        r'^document_version/(?P<pk>\d+)/submit/$',
+        APIDocumentVersionOCRView.as_view(),
+        name='document-version-ocr-submit-view'
+    ),
+    url(
+        r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
+        name='document-page-content-view'
+    ),
+]
diff --git a/mayan/apps/document_parsing/utils.py b/mayan/apps/document_parsing/utils.py
new file mode 100644
index 0000000000..8175c3040e
--- /dev/null
+++ b/mayan/apps/document_parsing/utils.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+from django.utils.encoding import force_text
+from django.utils.html import conditional_escape
+
+from .models import DocumentPageContent
+
+
+def get_document_ocr_content(document):
+    """Yield the escaped OCR text of every page that has OCR content."""
+    for document_page in document.pages.all():
+        try:
+            raw_text = document_page.ocr_content.content
+        except DocumentPageContent.DoesNotExist:
+            continue  # This page has no OCR content record; skip it.
+        yield conditional_escape(force_text(raw_text))
diff --git a/mayan/apps/document_parsing/views.py b/mayan/apps/document_parsing/views.py
new file mode 100644
index 0000000000..fa0871735b
--- /dev/null
+++ b/mayan/apps/document_parsing/views.py
@@ -0,0 +1,190 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.contrib import messages
+from django.http import HttpResponseRedirect
+from django.shortcuts import get_object_or_404
+from django.urls import reverse
+from django.utils.translation import ugettext_lazy as _
+
+from acls.models import AccessControlList
+from common.generics import (
+ ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
+ SingleObjectEditView, SingleObjectListView
+)
+from common.mixins import MultipleInstanceActionMixin
+from documents.models import Document, DocumentType
+
+from .forms import DocumentContentForm, DocumentTypeSelectForm
+from .models import DocumentVersionOCRError
+from .permissions import (
+ permission_ocr_content_view, permission_ocr_document,
+ permission_document_type_ocr_setup
+)
+from .utils import get_document_ocr_content
+
+
+class DocumentAllSubmitView(ConfirmView):
+    """Confirmation view that submits every document for OCR."""
+    extra_context = {'title': _('Submit all documents for OCR?')}
+
+    def get_post_action_redirect(self):
+        return reverse('common:tools_list')
+
+    def view_action(self):
+        # Counting from 1 leaves the number of documents in `submitted`.
+        submitted = 0
+        for submitted, document in enumerate(Document.objects.all(), 1):
+            document.submit_for_ocr()
+
+        message = _('%d documents added to the OCR queue.') % submitted
+        messages.success(self.request, message)
+
+
+class DocumentSubmitView(ConfirmView):
+    """Ask for confirmation, then submit a single document for OCR."""
+    def get_extra_context(self):
+        document = self.get_object()  # Fetch once, use for both keys.
+        return {
+            'object': document,
+            'title': _('Submit "%s" to the OCR queue?') % document
+        }
+
+    def get_object(self):
+        # Answer with HTTP 404, not an unhandled DoesNotExist, on a bad pk.
+        return get_object_or_404(Document, pk=self.kwargs['pk'])
+
+    def object_action(self, instance):
+        # NOTE(review): check_access presumably raises when access is
+        # denied, which would skip the submission below -- confirm.
+        AccessControlList.objects.check_access(
+            permissions=permission_ocr_document, user=self.request.user,
+            obj=instance
+        )
+
+        instance.submit_for_ocr()
+
+    def view_action(self):
+        instance = self.get_object()
+        self.object_action(instance=instance)
+
+        message = _('Document: %(document)s was added to the OCR queue.')
+        messages.success(self.request, message % {'document': instance})
+
+
+class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
+    """Submit several selected documents to the OCR queue in one action."""
+    model = Document
+    success_message = '%(count)d document submitted to the OCR queue.'
+    success_message_plural = '%(count)d documents submitted to the OCR queue.'
+
+    def get_extra_context(self):
+        # Replaces the single document context of DocumentSubmitView.
+        title = _('Submit the selected documents to the OCR queue?')
+        return {'title': title}
+
+
+class DocumentTypeSubmitView(FormView):
+    """Form view that submits every document of a chosen type for OCR."""
+
+    form_class = DocumentTypeSelectForm
+    extra_context = {
+        'title': _('Submit all documents of a type for OCR')
+    }
+
+    def get_post_action_redirect(self):
+        return reverse('common:tools_list')
+
+    def form_valid(self, form):
+        document_type = form.cleaned_data['document_type']
+
+        # Counting from 1 leaves the number of submitted documents in count.
+        count = 0
+        for count, document in enumerate(document_type.documents.all(), 1):
+            document.submit_for_ocr()
+
+        message = _(
+            '%(count)d documents of type "%(document_type)s" added to the '
+            'OCR queue.'
+        ) % {'count': count, 'document_type': document_type}
+        messages.success(self.request, message)
+
+        return HttpResponseRedirect(self.get_success_url())
+
+
+class DocumentTypeSettingsEditView(SingleObjectEditView):
+    """Edit the `auto_ocr` field of a document type's OCR settings."""
+
+    fields = ('auto_ocr',)
+    view_permission = permission_document_type_ocr_setup
+
+    def get_object(self, queryset=None):
+        # The edited object is the settings instance, not the type itself.
+        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
+        return document_type.ocr_settings
+
+    def get_extra_context(self):
+        document_type = self.get_object().document_type
+        title = _('Edit OCR settings for document type: %s') % document_type
+        return {'title': title}
+
+
+class DocumentOCRContent(SingleObjectDetailView):
+    """Detail view that shows the OCR content of a document."""
+    form_class = DocumentContentForm
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def dispatch(self, request, *args, **kwargs):
+        result = super(DocumentOCRContent, self).dispatch(
+            request, *args, **kwargs
+        )  # Runs first: if it raises, nothing is recorded as recent.
+        self.get_object().add_as_recent_document_for_user(request.user)
+        return result
+
+    def get_extra_context(self):
+        document = self.get_object()  # One lookup instead of three.
+        return {
+            'document': document, 'hide_labels': True, 'object': document,
+            'title': _('OCR result for document: %s') % document,
+        }
+
+
+class EntryListView(SingleObjectListView):
+    """List every DocumentVersionOCRError entry."""
+    extra_context = {'hide_object': True, 'title': _('OCR errors')}
+    view_permission = permission_ocr_document
+
+    def get_object_list(self):
+        # Unfiltered: every stored OCR error entry is returned.
+        ocr_errors = DocumentVersionOCRError.objects
+        return ocr_errors.all()
+
+
+class DocumentOCRErrorsListView(SingleObjectListView):
+    """List the OCR errors of the latest version of one document."""
+    view_permission = permission_ocr_document
+
+    def get_document(self):
+        return get_object_or_404(Document, pk=self.kwargs['pk'])
+
+    def get_extra_context(self):
+        document = self.get_document()  # One lookup instead of two.
+        title = _('OCR errors for document: %s') % document
+        return {'hide_object': True, 'object': document, 'title': title}
+
+    def get_object_list(self):
+        # Only the latest version is consulted, not older versions.
+        return self.get_document().latest_version.ocr_errors.all()
+
+
+class DocumentOCRDownloadView(SingleObjectDownloadView):
+    """Serve the OCR content of a document as a download named *-OCR."""
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def get_file(self):
+        document = self.get_object()  # One lookup instead of two.
+        content = get_document_ocr_content(document=document)
+        file_object = DocumentOCRDownloadView.TextIteratorIO(iterator=content)
+        name = '{}-OCR'.format(document)
+        return DocumentOCRDownloadView.VirtualFile(file=file_object, name=name)
diff --git a/mayan/settings/base.py b/mayan/settings/base.py
index 349a23fa0a..a2ef68b050 100644
--- a/mayan/settings/base.py
+++ b/mayan/settings/base.py
@@ -84,6 +84,7 @@ INSTALLED_APPS = (
'checkouts',
'document_comments',
'document_indexing',
+ 'document_parsing',
'document_signatures',
'document_states',
'documents',