Initial commit of the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2017-08-23 02:23:14 -04:00
parent 317d07a355
commit e9591c92f9
25 changed files with 1350 additions and 0 deletions
--- a/mayan/apps/document_parsing/init.py
+++ b/mayan/apps/document_parsing/init.py
@@ -0,0 +1,3 @@
 from __future__ import unicode_literals
 # Dotted path of the app configuration class for this app (the
 # DocumentParsingApp class is defined in apps.py).
 default_app_config = 'document_parsing.apps.DocumentParsingApp'
--- a/mayan/apps/document_parsing/admin.py
+++ b/mayan/apps/document_parsing/admin.py
@@ -0,0 +1,23 @@
 from __future__ import unicode_literals
 from django.contrib import admin
 from .models import (
    DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
 )
@admin.register(DocumentPageContent)
class DocumentPageContentAdmin(admin.ModelAdmin):
    """Admin options for the DocumentPageContent model."""

    # The change list shows a single column: the related document page.
    list_display = (
        'document_page',
    )
@admin.register(DocumentTypeSettings)
class DocumentTypeSettingsAdmin(admin.ModelAdmin):
    """Admin options for the DocumentTypeSettings model."""

    # NOTE(review): the models.py added by this same commit does not appear
    # to define DocumentTypeSettings -- confirm this registration is wanted
    # in the document parsing app.
    list_display = (
        'document_type', 'auto_ocr',
    )
@admin.register(DocumentVersionOCRError)
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
    """Admin options for the DocumentVersionOCRError model."""

    # NOTE(review): the models.py added by this same commit defines
    # DocumentVersionParseError (with these same three fields) but no
    # DocumentVersionOCRError -- confirm which model was intended.
    list_display = (
        'document_version', 'datetime_submitted',
    )
    readonly_fields = (
        'document_version', 'datetime_submitted', 'result',
    )
--- a/mayan/apps/document_parsing/api_views.py
+++ b/mayan/apps/document_parsing/api_views.py
@@ -0,0 +1,97 @@
 from __future__ import absolute_import, unicode_literals
 from rest_framework import generics, status
 from rest_framework.response import Response
 from documents.models import Document, DocumentPage, DocumentVersion
 from rest_api.permissions import MayanPermission
 from .models import DocumentPageContent
 from .permissions import permission_ocr_content_view, permission_ocr_document
 from .serializers import DocumentPageContentSerializer
 class APIDocumentOCRView(generics.GenericAPIView):
    # Permissions declared for each HTTP method of this view.
    # NOTE(review): permission_ocr_document is imported from .permissions,
    # which in this commit only defines permission_content_view -- confirm.
    mayan_object_permissions = {'POST': (permission_ocr_document,)}
    permission_classes = (MayanPermission,)
    queryset = Document.objects.all()

    def get_serializer_class(self):
        # This view does not use a serializer class.
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """
        document = self.get_object()
        document.submit_for_ocr()
        return Response(status=status.HTTP_202_ACCEPTED)
 class APIDocumentVersionOCRView(generics.GenericAPIView):
    # Permissions declared for each HTTP method of this view.
    # NOTE(review): permission_ocr_document is imported from .permissions,
    # which in this commit only defines permission_content_view -- confirm.
    mayan_object_permissions = {'POST': (permission_ocr_document,)}
    permission_classes = (MayanPermission,)
    queryset = DocumentVersion.objects.all()

    def get_serializer_class(self):
        # This view does not use a serializer class.
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document version for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """
        document_version = self.get_object()
        document_version.submit_for_ocr()
        return Response(status=status.HTTP_202_ACCEPTED)
 class APIDocumentPageContentView(generics.RetrieveAPIView):
    """
    Returns the OCR content of the selected document page.
    ---
    GET:
        parameters:
            - name: pk
              paramType: path
              type: number
    """
    # Permissions declared for each HTTP method of this view.
    # NOTE(review): permission_ocr_content_view is imported from
    # .permissions, which in this commit only defines
    # permission_content_view -- confirm.
    mayan_object_permissions = {
        'GET': (permission_ocr_content_view,),
    }
    permission_classes = (MayanPermission,)
    serializer_class = DocumentPageContentSerializer
    queryset = DocumentPage.objects.all()

    def retrieve(self, request, *args, **kwargs):
        # The looked up object is a DocumentPage; what gets serialized is its
        # related DocumentPageContent.
        instance = self.get_object()

        try:
            # DocumentPageContent.document_page declares
            # related_name='content', so that is the reverse accessor that
            # raises DocumentPageContent.DoesNotExist for unparsed pages.
            page_content = instance.content
        except DocumentPageContent.DoesNotExist:
            # Fallback kept as in the original code when the page has no
            # content entry.
            page_content = DocumentPageContent.objects.none()

        serializer = self.get_serializer(page_content)
        return Response(serializer.data)
--- a/mayan/apps/document_parsing/apps.py
+++ b/mayan/apps/document_parsing/apps.py
@@ -0,0 +1,125 @@
 from __future__ import unicode_literals
 import logging
 from kombu import Exchange, Queue
 from django.apps import apps
 from django.db.models.signals import post_save
 from django.utils.translation import ugettext_lazy as _
 from acls import ModelPermission
 from common import (
    MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
    menu_tools
 )
 from common.settings import settings_db_sync_task_delay
 from documents.search import document_search, document_page_search
 from documents.signals import post_version_upload
 from documents.widgets import document_link
 from mayan.celery import app
 from navigation import SourceColumn
 from rest_api.classes import APIEndPoint
 from .handlers import handler_parse_document_version
 from .links import (
    link_document_content, link_entry_list, link_document_content_errors_list,
    link_document_content_download
 )
 from .permissions import permission_content_view
 logger = logging.getLogger(__name__)
 class DocumentParsingApp(MayanAppConfig):
    """
    App configuration of the document parsing app.

    ready() registers the API endpoint, the content view permission, the
    parse error list columns, the content search fields, the menu links and
    the signal handler that parses newly uploaded document versions.
    """
    has_tests = True
    name = 'document_parsing'
    verbose_name = _('Document parsing')

    def ready(self):
        super(DocumentParsingApp, self).ready()

        APIEndPoint(app=self, version_string='1')

        # Models are fetched from the app registry instead of being imported
        # at module level.
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )
        DocumentVersion = apps.get_model(
            app_label='documents', model_name='DocumentVersion'
        )
        DocumentVersionParseError = self.get_model('DocumentVersionParseError')

        ModelPermission.register(
            model=Document, permissions=(permission_content_view,)
        )

        # Columns of the parse error list
        SourceColumn(
            source=DocumentVersionParseError, label=_('Document'),
            func=lambda context: document_link(context['object'].document_version.document)
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Added'),
            attribute='datetime_submitted'
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Result'),
            attribute='result'
        )

        # The lookups follow related_name='content' of
        # DocumentPageContent.document_page.
        document_search.add_model_field(
            field='versions__pages__content__content', label=_('Content')
        )
        document_page_search.add_model_field(
            field='content__content', label=_('Content')
        )

        menu_facet.bind_links(
            links=(link_document_content,), sources=(Document,)
        )

        # TODO: the original block also bound link_document_submit_multiple,
        # link_document_submit and link_document_type_ocr_settings (the
        # latter to DocumentType). None of them is imported by this module
        # or defined in links.py, so ready() raised NameError. Restore the
        # bindings once those links exist.

        # Only links imported from .links are used here; the source view
        # names match the `view` argument of each of those links.
        menu_secondary.bind_links(
            links=(
                link_document_content, link_document_content_errors_list,
                link_document_content_download
            ),
            sources=(
                'document_parsing:document_content',
                'document_parsing:document_page_parsing_error_list',
                'document_parsing:document_content_download',
            )
        )
        menu_secondary.bind_links(
            links=(link_entry_list,),
            sources=(
                'document_parsing:entry_list',
                'document_parsing:entry_delete_multiple',
                'document_parsing:entry_re_queue_multiple',
                DocumentVersionParseError
            )
        )
        # The trailing comma is required: without it the parentheses do not
        # build a tuple and a bare link object is passed instead.
        menu_tools.bind_links(
            links=(
                link_entry_list,
            )
        )

        post_version_upload.connect(
            dispatch_uid='document_parsing_handler_parse_document_version',
            receiver=handler_parse_document_version,
            sender=DocumentVersion
        )
--- a/mayan/apps/document_parsing/exceptions.py
+++ b/mayan/apps/document_parsing/exceptions.py
@@ -0,0 +1,22 @@
 from __future__ import unicode_literals
 class OCRError(Exception):
    """Raised by the OCR backend."""
 class ParserError(Exception):
    """Base exception for file parsers."""
 class NoMIMETypeMatch(ParserError):
    """There is no parser registered for the specified MIME type."""
--- a/mayan/apps/document_parsing/forms.py
+++ b/mayan/apps/document_parsing/forms.py
@@ -0,0 +1,104 @@
 from __future__ import unicode_literals
 from django import forms
 from django.utils.encoding import force_text
 from django.utils.html import conditional_escape
 from django.utils.safestring import mark_safe
 from django.utils.translation import ugettext_lazy as _, ugettext
 from common.widgets import TextAreaDiv
 from documents.models import DocumentType
 from .models import DocumentPageContent, DocumentPageOCRContent
 class DocumentContentForm(forms.Form):
    """
    Form that concatenates all of a document pages' text content into a
    single textarea widget

    The document is passed in the `instance` keyword argument. Pages without
    a DocumentPageContent entry are skipped.
    """
    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )

    def __init__(self, *args, **kwargs):
        self.document = kwargs.pop('instance', None)
        super(DocumentContentForm, self).__init__(*args, **kwargs)

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document was passed (or it has no `pages` attribute)
            document_pages = []

        content = []
        for page in document_pages:
            try:
                # DocumentPageContent.document_page declares
                # related_name='content'; this is the accessor that raises
                # DocumentPageContent.DoesNotExist for unparsed pages.
                page_content = page.content.content
            except DocumentPageContent.DoesNotExist:
                continue

            # Page text is escaped because the whole value is marked safe
            # below; only the divider markup is meant to be rendered as HTML.
            content.append(conditional_escape(force_text(page_content)))
            content.append(
                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                    ugettext(
                        'Page %(page_number)d'
                    ) % {'page_number': page.page_number}
                )
            )

        self.fields['contents'].initial = mark_safe(''.join(content))
 class DocumentOCRContentForm(forms.Form):
    """
    Form that concatenates all of a document pages' OCR text content into a
    single textarea widget

    The document is passed in the `instance` keyword argument. Pages whose
    `ocr_content` lookup raises DocumentPageOCRContent.DoesNotExist are
    skipped.
    """
    # NOTE(review): DocumentPageOCRContent is imported from .models, but the
    # models.py added by this commit does not appear to define it -- confirm.
    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )

    def __init__(self, *args, **kwargs):
        self.document = kwargs.pop('instance', None)
        # super() must name this class; naming DocumentContentForm raised
        # TypeError because this form is not a subclass of it.
        super(DocumentOCRContentForm, self).__init__(*args, **kwargs)

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document was passed (or it has no `pages` attribute)
            document_pages = []

        content = []
        for page in document_pages:
            try:
                page_content = page.ocr_content.content
            except DocumentPageOCRContent.DoesNotExist:
                continue

            # Page text is escaped because the whole value is marked safe
            # below; only the divider markup is meant to be rendered as HTML.
            content.append(conditional_escape(force_text(page_content)))
            content.append(
                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                    ugettext(
                        'Page %(page_number)d'
                    ) % {'page_number': page.page_number}
                )
            )

        self.fields['contents'].initial = mark_safe(''.join(content))
 class DocumentTypeSelectForm(forms.Form):
    """Form with a single field to select one document type."""
    # The label goes through the lazy translation function like the other
    # labels in this module; the original parentheses only wrapped a plain,
    # untranslated string.
    document_type = forms.ModelChoiceField(
        queryset=DocumentType.objects.all(), label=_('Document type')
    )
--- a/mayan/apps/document_parsing/handlers.py
+++ b/mayan/apps/document_parsing/handlers.py
@@ -0,0 +1,15 @@
 from __future__ import unicode_literals
 import logging
 from django.apps import apps
 from .settings import setting_auto_ocr
 from .parsers import Parser
 logger = logging.getLogger(__name__)
 def handler_parse_document_version(sender, instance, **kwargs):
    """
    Signal receiver that parses a document version.

    Parsing only happens when the `created` keyword argument sent with the
    signal is true. The keyword is read with a plain subscript, so it has to
    be present.
    """
    if not kwargs['created']:
        return

    Parser.parse_document_version(document_version=instance)
--- a/mayan/apps/document_parsing/links.py
+++ b/mayan/apps/document_parsing/links.py
@@ -0,0 +1,27 @@
 from __future__ import unicode_literals
 from django.utils.translation import ugettext_lazy as _
 from navigation import Link
 from .permissions import permission_content_view
 # permission_content_view is the only permission imported by this module
 # (and the only one defined in permissions.py). The original code used
 # permission_ocr_document and permission_ocr_content_view for the last
 # three links, which raised NameError when this module was imported.

 # Shows the parsed content of a document.
 link_document_content = Link(
    args='resolved_object.id', icon='fa fa-font',
    permissions=(permission_content_view,), text=_('Content'),
    view='document_parsing:document_content',
 )
 # Global list of parsing errors.
 link_entry_list = Link(
    icon='fa fa-file-text-o', permissions=(permission_content_view,),
    text=_('Parsing errors'), view='document_parsing:entry_list'
 )
 # Parsing errors of a single object.
 link_document_content_errors_list = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Parsing errors'),
    view='document_parsing:document_page_parsing_error_list'
 )
 # Download of the parsed content.
 link_document_content_download = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Download content'),
    view='document_parsing:document_content_download'
 )
--- a/mayan/apps/document_parsing/managers.py
+++ b/mayan/apps/document_parsing/managers.py
@@ -0,0 +1,14 @@
 from __future__ import unicode_literals
 from datetime import timedelta
 import logging
 from django.apps import apps
 from django.db import models
 from django.utils.timezone import now
 logger = logging.getLogger(__name__)
 class DocumentPageContentManager(models.Manager):
    """
    Manager used by the DocumentPageContent model. It does not add anything
    to models.Manager yet.
    """
--- a/mayan/apps/document_parsing/models.py
+++ b/mayan/apps/document_parsing/models.py
@@ -0,0 +1,47 @@
 from __future__ import unicode_literals
 from django.db import models
 from django.utils.encoding import force_text, python_2_unicode_compatible
 from django.utils.translation import ugettext_lazy as _
 from documents.models import DocumentPage, DocumentType, DocumentVersion
 from .managers import DocumentPageContentManager
@python_2_unicode_compatible
class DocumentPageContent(models.Model):
    """Text content of a single document page."""

    # One entry per page; reachable from the page as `page.content`.
    document_page = models.OneToOneField(
        DocumentPage,
        on_delete=models.CASCADE,
        related_name='content',
        verbose_name=_('Document page')
    )
    content = models.TextField(blank=True, verbose_name=_('Content'))

    objects = DocumentPageContentManager()

    class Meta:
        verbose_name = _('Document page content')
        verbose_name_plural = _('Document pages contents')

    def __str__(self):
        # Labelled after the page the content belongs to.
        page_label = force_text(self.document_page)
        return page_label
@python_2_unicode_compatible
class DocumentVersionParseError(models.Model):
    """Error recorded while parsing a document version."""

    document_version = models.ForeignKey(
        DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
        verbose_name=_('Document version')
    )
    # `auto_now_add` is the DateTimeField option that stamps the row on
    # creation; the original `auto_add_now` is not a field option and made
    # the field constructor raise TypeError.
    datetime_submitted = models.DateTimeField(
        auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
    )
    result = models.TextField(blank=True, null=True, verbose_name=_('Result'))

    def __str__(self):
        return force_text(self.document_version)

    class Meta:
        ordering = ('datetime_submitted',)
        verbose_name = _('Document version parse error')
        verbose_name_plural = _('Document version parse errors')
--- a/mayan/apps/document_parsing/parsers.py
+++ b/mayan/apps/document_parsing/parsers.py
@@ -0,0 +1,202 @@
 from __future__ import unicode_literals
 from io import BytesIO
 import logging
 import os
 from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
 from pdfminer.pdfpage import PDFPage
 from pdfminer.converter import TextConverter
 from pdfminer.layout import LAParams
 import subprocess
 from django.utils.translation import ugettext_lazy as _
 from common.utils import copyfile, fs_cleanup, mkstemp
 from .exceptions import ParserError, NoMIMETypeMatch
 from .models import DocumentPageContent
 from .settings import setting_pdftotext_path
 logger = logging.getLogger(__name__)
 class Parser(object):
    """
    Parser base class

    Keeps a registry that maps a MIME type to the list of parser classes
    registered for it, and drives the parsing of document versions and
    document pages. Subclasses must implement execute().
    """
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append every class in `parser_classes` to the parser list of every
        MIME type in `mimetypes`. Parsers are later tried in the order in
        which they were registered.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    @classmethod
    def _get_parser_classes(cls, mimetype):
        """
        Return the parser classes registered for `mimetype`.

        Raises NoMIMETypeMatch when the MIME type has no registered parser.
        Only the registry lookup is guarded, so a KeyError raised while a
        parser runs is no longer reported as a missing MIME type.
        """
        try:
            return cls._registry[mimetype]
        except KeyError:
            raise NoMIMETypeMatch(
                'No parser registered for MIME type: %s' % mimetype
            )

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse all the pages of `document_version` with the first registered
        parser that succeeds.

        Raises NoMIMETypeMatch when no parser is registered for the MIME
        type of the document version or when every registered parser raised
        ParserError.
        """
        parser_classes = cls._get_parser_classes(document_version.mimetype)

        for parser_class in parser_classes:
            try:
                parser = parser_class()
                parser.process_document_version(document_version)
            except ParserError:
                # If parser raises error, try next parser in the list
                continue

            # If parser was successful there is no need to try
            # others in the list for this mimetype
            return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    @classmethod
    def parse_document_page(cls, document_page):
        """
        Parse a single document page with the first registered parser that
        succeeds.

        Raises NoMIMETypeMatch under the same conditions as
        parse_document_version().
        """
        parser_classes = cls._get_parser_classes(
            document_page.document_version.mimetype
        )

        for parser_class in parser_classes:
            try:
                parser = parser_class()
                parser.process_document_page(document_page)
            except ParserError:
                # If parser raises error, try next parser in the list
                continue

            # If parser was successful there is no need to try
            # others in the list for this mimetype
            return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    def process_document_version(self, document_version):
        """
        Parse every page of `document_version` with this parser. A
        ParserError raised by any page propagates to the caller.
        """
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Run execute() for `document_page` and store the returned text in the
        DocumentPageContent entry of the page, creating the entry if needed.

        Any exception raised while doing so is logged and re-raised as
        ParserError. The file object is always closed.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page `page_number` of `file_object`. Must be
        implemented by every subclass.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
 class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """
    def __init__(self):
        """
        Read the pdftotext path from the settings. Raises ParserError when
        there is nothing at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Return the text pdftotext extracts from page `page_number` of
        `file_object`, as read from the standard output of the process.

        Raises ParserError when pdftotext exits with a non zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()

        try:
            # pdftotext is given a file path, so the content is copied to a
            # temporary file first.
            copyfile(file_object, temp_filepath)

            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting for the process.
            # wait() followed by read() can deadlock once the output of the
            # child process fills the pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # The temporary file is removed on every path, including errors
            # raised while copying the file or starting the process.
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError(error_output)

        # A lone form feed is what comes back for a page without text
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        # Strip the trailing two line feeds and form feed
        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
 class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """
    def execute(self, file_object, page_number):
        """
        Return the text PDFMiner extracts from page `page_number` of
        `file_object`, as the raw value of the output buffer.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, outfp=string_buffer, laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # page_number is 1 based while `pagenos` is given 0 based
                # indexes here, hence the subtraction.
                pages = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # The next() builtin works on Python 2 and 3; the .next()
                # method only exists on Python 2 iterators.
                interpreter.process_page(next(pages))
            finally:
                # Close the device even when the page cannot be processed
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            return string_buffer.getvalue()
 # Registration order matters: Parser.parse_document_version() and
 # Parser.parse_document_page() try the classes in list order and stop at the
 # first one that does not raise ParserError, so PopplerParser is tried
 # before PDFMinerParser.
 Parser.register(
    mimetypes=('application/pdf',),
    parser_classes=(PopplerParser, PDFMinerParser)
 )
--- a/mayan/apps/document_parsing/permissions.py
+++ b/mayan/apps/document_parsing/permissions.py
@@ -0,0 +1,11 @@
 from __future__ import absolute_import, unicode_literals
 from django.utils.translation import ugettext_lazy as _
 from permissions import PermissionNamespace
 # Namespace that holds the permissions of the document parsing app.
 namespace = PermissionNamespace('document_parsing', _('Document parsing'))
 # Permission required by the document content link; apps.py registers it
 # for the Document model.
 permission_content_view = namespace.add_permission(
    name='content_view', label=_('View the content of a document')
 )
--- a/mayan/apps/document_parsing/queues.py
+++ b/mayan/apps/document_parsing/queues.py
@@ -0,0 +1,10 @@
 from __future__ import unicode_literals
 from django.utils.translation import ugettext_lazy as _
 from task_manager.classes import CeleryQueue
 # NOTE(review): the queue name, its label and the task path below all refer
 # to OCR ('ocr.tasks.task_do_ocr'), not to document parsing. This looks
 # carried over from the OCR app -- confirm whether a parsing specific queue
 # and task were intended.
 queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
 queue_ocr.add_task_type(
    name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
 )
--- a/mayan/apps/document_parsing/serializers.py
+++ b/mayan/apps/document_parsing/serializers.py
@@ -0,0 +1,11 @@
 from __future__ import unicode_literals
 from rest_framework import serializers
 from .models import DocumentPageContent
 class DocumentPageContentSerializer(serializers.ModelSerializer):
    # Exposes only the `content` field of DocumentPageContent.
    class Meta:
        model = DocumentPageContent
        fields = ('content',)
--- a/mayan/apps/document_parsing/settings.py
+++ b/mayan/apps/document_parsing/settings.py
@@ -0,0 +1,17 @@
 from __future__ import unicode_literals
 from django.utils.translation import ugettext_lazy as _
 from smart_settings import Namespace
 # Settings namespace of the document parsing app.
 namespace = Namespace(name='document_parsing', label=_('Document parsing'))
 # Path of the pdftotext executable; PopplerParser reads it in its
 # constructor and raises ParserError when nothing exists at that path.
 setting_pdftotext_path = namespace.add_setting(
    global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH',
    default='/usr/bin/pdftotext',
    help_text=_(
        'File path to poppler\'s pdftotext program used to extract text '
        'from PDF files.'
    ),
    is_path=True
 )
--- a/mayan/apps/document_parsing/tests/init.py
+++ b/mayan/apps/document_parsing/tests/init.py
--- a/mayan/apps/document_parsing/tests/test_api.py
+++ b/mayan/apps/document_parsing/tests/test_api.py
@@ -0,0 +1,88 @@
 from __future__ import unicode_literals
 import json
 from django.contrib.auth import get_user_model
 from django.urls import reverse
 from rest_framework import status
 from documents.models import DocumentType
 from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
 from rest_api.tests import BaseAPITestCase
 from user_management.tests import (
    TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
 )
 class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints
    """
    # NOTE(review): the URL names and the `ocr_content` accessor used below
    # refer to OCR, while the model of this app uses related_name='content'
    # -- confirm these tests target the document parsing app.

    def setUp(self):
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is uploaded as is, so it is opened in binary mode;
        # text mode would decode (or fail to decode) its bytes.
        with open(TEST_SMALL_DOCUMENT_PATH, mode='rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the assertion fails
        self.assertIn('Mayan EDMS Documentation', content)

    def test_submit_document_version(self):
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_get_document_version_page_content(self):
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)

        self.assertIn(
            'Mayan EDMS Documentation',
            json.loads(response.content)['content']
        )
--- a/mayan/apps/document_parsing/tests/test_events.py
+++ b/mayan/apps/document_parsing/tests/test_events.py
@@ -0,0 +1,41 @@
 from __future__ import unicode_literals
 from actstream.models import Action
 from documents.tests.test_models import GenericDocumentTestCase
 from ..events import (
    event_ocr_document_version_submit, event_ocr_document_version_finish
 )
 class OCREventsTestCase(GenericDocumentTestCase):
    """
    Check the events recorded when a document is submitted for OCR.
    """
    # Leftover debugging was removed from test_document_version_finish_event:
    # Python 2 only `print` statements, a commented-out line, and a local
    # import of a model (DocumentVersionOCRError) that the models.py of this
    # commit does not define.

    def test_document_version_submit_event(self):
        # Start from an empty action log so first()/last() refer to this run
        Action.objects.all().delete()

        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.first().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.first().verb,
            event_ocr_document_version_submit.name
        )

    def test_document_version_finish_event(self):
        Action.objects.all().delete()

        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.last().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.last().verb,
            event_ocr_document_version_finish.name
        )
--- a/mayan/apps/document_parsing/tests/test_models.py
+++ b/mayan/apps/document_parsing/tests/test_models.py
@@ -0,0 +1,77 @@
 # -*- coding: utf-8 -*-
 from __future__ import unicode_literals
 from common.tests import BaseTestCase
 from documents.models import DocumentType
 from documents.settings import setting_language_choices
 from documents.tests import (
    TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
 )
 class DocumentOCRTestCase(BaseTestCase):
    """
    Upload a small document and check the text content of its first page.
    """
    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until it is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is uploaded as is, so it is opened in binary mode;
        # text mode would decode (or fail to decode) its bytes.
        with open(TEST_SMALL_DOCUMENT_PATH, mode='rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        # NOTE(review): `ocr_content` is not the accessor declared by the
        # DocumentPageContent model of this app (related_name='content')
        # -- confirm which content this test is meant to check.
        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the assertion fails
        self.assertIn('Mayan EDMS Documentation', content)
 class GermanOCRSupportTestCase(BaseTestCase):
    """
    Upload a German document and check the text content of its first page.
    """
    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until it is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get corresponding language code for German from the default language
        # choices list
        language_code = [
            language for language in setting_language_choices.value if language[1] == 'German'
        ][0][0]

        self.assertEqual('deu', language_code)

        # The fixture is uploaded as is, so it is opened in binary mode;
        # text mode would decode (or fail to decode) its bytes.
        with open(TEST_DEU_DOCUMENT_PATH, mode='rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        # NOTE(review): `ocr_content` is not the accessor declared by the
        # DocumentPageContent model of this app (related_name='content')
        # -- confirm which content this test is meant to check.
        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the assertion fails
        self.assertIn('Repository für elektronische Dokumente.', content)
        self.assertIn('Es bietet einen', content)
--- a/mayan/apps/document_parsing/tests/test_parsers.py
+++ b/mayan/apps/document_parsing/tests/test_parsers.py
@@ -0,0 +1,83 @@
 from __future__ import unicode_literals
 from django.core.files.base import File
 from django.test import override_settings
 from common.tests import BaseTestCase
 from documents.models import DocumentType
 from documents.tests import (
    TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
 )
 from ..classes import TextExtractor
 from ..parsers import PDFMinerParser, PopplerParser
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """Run each parser over the test document and check the text stored
    for its first page."""

    def setUp(self):
        super(ParserTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open in binary mode so the file content reaches new_document
        # unmodified (no text decoding or newline translation).
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        # Remove the document type created in setUp before the base class
        # cleanup runs.
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()
        parser.process_document_version(self.document.latest_version)

        # assertIn reports the searched text and the actual content on
        # failure.
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        parser = PopplerParser()
        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """Run TextExtractor over the hybrid test document and compare the
    content stored for its first and last pages."""

    def setUp(self):
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open in binary mode so the file content reaches new_document
        # unmodified (no text decoding or newline translation).
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        # Remove the document type created in setUp before the base class
        # cleanup runs.
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        # First page of the latest version
        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        # Last page of the latest version
        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )
--- a/mayan/apps/document_parsing/tests/test_views.py
+++ b/mayan/apps/document_parsing/tests/test_views.py
@@ -0,0 +1,61 @@
 from __future__ import unicode_literals
 from django.test import override_settings
 from documents.tests.test_views import GenericDocumentViewTestCase
 from ..permissions import permission_ocr_content_view
 from ..utils import get_document_ocr_content
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
    """Permission and content checks for the OCR content and OCR download
    views."""

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Skip the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(OCRViewsTestCase, self).setUp()
        self.login_user()

    def _document_content_view(self):
        # Request the OCR content view for the test document
        return self.get('ocr:document_content', args=(self.document.pk,))

    def _document_ocr_download_view(self):
        # Request the OCR download view for the test document
        return self.get(
            'ocr:document_ocr_download', args=(self.document.pk,)
        )

    def test_document_content_view_no_permissions(self):
        content_response = self._document_content_view()

        self.assertEqual(content_response.status_code, 403)

    def test_document_content_view_with_permission(self):
        self.grant_permission(permission=permission_ocr_content_view)

        content_response = self._document_content_view()

        self.assertContains(
            content_response, 'Mayan EDMS Documentation', status_code=200
        )

    def test_document_ocr_download_view_no_permission(self):
        download_response = self._document_ocr_download_view()

        self.assertEqual(download_response.status_code, 403)

    def test_document_download_view_with_permission(self):
        self.expected_content_type = 'application/octet-stream; charset=utf-8'

        self.grant_permission(permission=permission_ocr_content_view)

        download_response = self._document_ocr_download_view()

        self.assertEqual(download_response.status_code, 200)

        # The download is compared against the concatenated per-page OCR
        # content of the document.
        expected_content = ''.join(
            get_document_ocr_content(document=self.document)
        )
        self.assert_download_response(
            download_response, content=(expected_content),
        )
--- a/mayan/apps/document_parsing/urls.py
+++ b/mayan/apps/document_parsing/urls.py
@@ -0,0 +1,65 @@
 from __future__ import unicode_literals
 from django.conf.urls import url
 from .api_views import (
    APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
 )
 from .views import (
    DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
    DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
    DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
 )
 urlpatterns = [
    # Document OCR content
    url(
        regex=r'^(?P<pk>\d+)/content/$',
        view=DocumentOCRContent.as_view(),
        name='document_content'
    ),
    # OCR submission: single document, all documents, by document type and
    # multiple selected documents
    url(
        regex=r'^document/(?P<pk>\d+)/submit/$',
        view=DocumentSubmitView.as_view(),
        name='document_submit'
    ),
    url(
        regex=r'^document/all/submit/$',
        view=DocumentAllSubmitView.as_view(),
        name='document_submit_all'
    ),
    url(
        regex=r'^document/type/submit/$',
        view=DocumentTypeSubmitView.as_view(),
        name='document_type_submit'
    ),
    url(
        regex=r'^document/multiple/submit/$',
        view=DocumentSubmitManyView.as_view(),
        name='document_submit_multiple'
    ),
    # Per document type OCR settings
    url(
        regex=r'^document_type/(?P<pk>\d+)/ocr/settings/$',
        view=DocumentTypeSettingsEditView.as_view(),
        name='document_type_ocr_settings'
    ),
    # Per document OCR errors and OCR content download
    url(
        regex=r'^documents/(?P<pk>\d+)/ocr/errors/$',
        view=DocumentOCRErrorsListView.as_view(),
        name='document_ocr_error_list'
    ),
    url(
        regex=r'^documents/(?P<pk>\d+)/ocr/download/$',
        view=DocumentOCRDownloadView.as_view(),
        name='document_ocr_download'
    ),
    # List of all OCR error entries
    url(regex=r'^all/$', view=EntryListView.as_view(), name='entry_list'),
 ]
 api_urls = [
    # Submit a document for OCR
    url(
        regex=r'^document/(?P<pk>\d+)/submit/$',
        view=APIDocumentOCRView.as_view(),
        name='document-ocr-submit-view'
    ),
    # Submit a document version for OCR
    url(
        regex=r'^document_version/(?P<pk>\d+)/submit/$',
        view=APIDocumentVersionOCRView.as_view(),
        name='document-version-ocr-submit-view'
    ),
    # Content of a document page
    url(
        regex=r'^page/(?P<pk>\d+)/content/$',
        view=APIDocumentPageContentView.as_view(),
        name='document-page-content-view'
    ),
 ]
--- a/mayan/apps/document_parsing/utils.py
+++ b/mayan/apps/document_parsing/utils.py
@@ -0,0 +1,16 @@
 from __future__ import unicode_literals
 from django.utils.encoding import force_text
 from django.utils.html import conditional_escape
 from .models import DocumentPageContent
 def get_document_ocr_content(document):
    """Yield the HTML-escaped OCR content of each page of `document`.

    Pages without an associated DocumentPageContent are skipped.
    """
    for document_page in document.pages.all():
        try:
            raw_content = document_page.ocr_content.content
        except DocumentPageContent.DoesNotExist:
            # No OCR content stored for this page
            continue

        yield conditional_escape(force_text(raw_content))
--- a/mayan/apps/document_parsing/views.py
+++ b/mayan/apps/document_parsing/views.py
@@ -0,0 +1,190 @@
 from __future__ import absolute_import, unicode_literals
 from django.contrib import messages
 from django.http import HttpResponseRedirect
 from django.shortcuts import get_object_or_404
 from django.urls import reverse
 from django.utils.translation import ugettext_lazy as _
 from acls.models import AccessControlList
 from common.generics import (
    ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
    SingleObjectEditView, SingleObjectListView
 )
 from common.mixins import MultipleInstanceActionMixin
 from documents.models import Document, DocumentType
 from .forms import DocumentContentForm, DocumentTypeSelectForm
 from .models import DocumentVersionOCRError
 from .permissions import (
    permission_ocr_content_view, permission_ocr_document,
    permission_document_type_ocr_setup
 )
 from .utils import get_document_ocr_content
 class DocumentAllSubmitView(ConfirmView):
    """Confirmation view that submits every document for OCR."""
    extra_context = {'title': _('Submit all documents for OCR?')}

    def get_post_action_redirect(self):
        # Destination after the action: the 'common:tools_list' URL
        return reverse('common:tools_list')

    def view_action(self):
        # `submitted` stays at 0 when there are no documents; otherwise
        # enumerate leaves it at the number of documents processed.
        submitted = 0
        for submitted, document in enumerate(Document.objects.all(), 1):
            document.submit_for_ocr()

        message = _('%d documents added to the OCR queue.') % submitted
        messages.success(self.request, message)
 class DocumentSubmitView(ConfirmView):
    """Confirmation view that submits a single document for OCR."""

    def get_extra_context(self):
        # Fetch the document once and reuse it for both context entries
        document = self.get_object()

        return {
            'object': document,
            'title': _('Submit "%s" to the OCR queue?') % document
        }

    def get_object(self):
        # Respond with HTTP 404 for an unknown primary key instead of
        # letting Document.DoesNotExist propagate as a server error. This
        # matches the lookups used by the other views in this module.
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def object_action(self, instance):
        # Check the requesting user's access to `instance` for the OCR
        # permission before queuing it.
        # NOTE(review): check_access presumably raises on denial -- confirm.
        AccessControlList.objects.check_access(
            permissions=permission_ocr_document, user=self.request.user,
            obj=instance
        )

        instance.submit_for_ocr()

    def view_action(self):
        instance = self.get_object()

        self.object_action(instance=instance)

        messages.success(
            self.request,
            _('Document: %(document)s was added to the OCR queue.') % {
                'document': instance
            }
        )
 class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
    """Multiple document variant of DocumentSubmitView."""
    model = Document
    success_message = '%(count)d document submitted to the OCR queue.'
    success_message_plural = '%(count)d documents submitted to the OCR queue.'

    def get_extra_context(self):
        # Replaces the parent's context, which is built around a single
        # document.
        title = _('Submit the selected documents to the OCR queue?')
        return {'title': title}
 class DocumentTypeSubmitView(FormView):
    """Form view that submits all documents of a chosen type for OCR."""
    extra_context = {
        'title': _('Submit all documents of a type for OCR')
    }
    form_class = DocumentTypeSelectForm

    def get_post_action_redirect(self):
        # Destination after the action: the 'common:tools_list' URL
        return reverse('common:tools_list')

    def form_valid(self, form):
        document_type = form.cleaned_data['document_type']

        # `submitted` stays at 0 when the type has no documents; otherwise
        # enumerate leaves it at the number of documents processed.
        submitted = 0
        for submitted, document in enumerate(document_type.documents.all(), 1):
            document.submit_for_ocr()

        message = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'OCR queue.'
        ) % {'count': submitted, 'document_type': document_type}
        messages.success(self.request, message)

        return HttpResponseRedirect(self.get_success_url())
 class DocumentTypeSettingsEditView(SingleObjectEditView):
    """Edit view for the OCR settings of a document type."""
    fields = ('auto_ocr',)
    view_permission = permission_document_type_ocr_setup

    def get_object(self, queryset=None):
        # The edited object is the `ocr_settings` related object of the
        # document type named in the URL, not the document type itself.
        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
        return document_type.ocr_settings

    def get_extra_context(self):
        title_template = _('Edit OCR settings for document type: %s')
        return {'title': title_template % self.get_object().document_type}
 class DocumentOCRContent(SingleObjectDetailView):
    """Detail view presenting the OCR result of a document."""
    form_class = DocumentContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        result = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )
        # Reached only when the parent dispatch returned without raising;
        # record the document as recently accessed by the requesting user.
        self.get_object().add_as_recent_document_for_user(request.user)
        return result

    def get_extra_context(self):
        # Fetch the document once instead of once per context entry
        document = self.get_object()

        return {
            'document': document,
            'hide_labels': True,
            'object': document,
            'title': _('OCR result for document: %s') % document,
        }
 class EntryListView(SingleObjectListView):
    """List view of all DocumentVersionOCRError entries."""
    view_permission = permission_ocr_document
    extra_context = {
        'hide_object': True,
        'title': _('OCR errors'),
    }

    def get_object_list(self):
        # Unfiltered: every stored OCR error entry
        error_entries = DocumentVersionOCRError.objects.all()
        return error_entries
 class DocumentOCRErrorsListView(SingleObjectListView):
    """List view of the OCR errors of a document's latest version."""
    view_permission = permission_ocr_document

    def get_document(self):
        # Responds with HTTP 404 when no document matches the URL `pk`
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def get_extra_context(self):
        # Look the document up once and reuse it for both context entries
        document = self.get_document()

        return {
            'hide_object': True,
            'object': document,
            'title': _('OCR errors for document: %s') % document,
        }

    def get_object_list(self):
        return self.get_document().latest_version.ocr_errors.all()
 class DocumentOCRDownloadView(SingleObjectDownloadView):
    """Download view serving the OCR content of a document as a file."""
    model = Document
    object_permission = permission_ocr_content_view

    def get_file(self):
        # Fetch the document once; it is needed for both the content
        # iterator and the download file name.
        document = self.get_object()

        # Wrap the per-page content generator in a file-like object
        file_object = DocumentOCRDownloadView.TextIteratorIO(
            iterator=get_document_ocr_content(document=document)
        )

        return DocumentOCRDownloadView.VirtualFile(
            file=file_object, name='{}-OCR'.format(document)
        )
--- a/mayan/settings/base.py
+++ b/mayan/settings/base.py
@@ -84,6 +84,7 @@ INSTALLED_APPS = (
    'checkouts',
    'document_comments',
    'document_indexing',
    'document_parsing',
    'document_signatures',
    'document_states',
    'documents',
		`@@ -0,0 +1,3 @@`
							`from __future__ import unicode_literals`

							`default_app_config = 'document_parsing.apps.DocumentParsingApp'`