Initial commit of the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2017-08-23 02:23:14 -04:00
parent 317d07a355
commit e9591c92f9
25 changed files with 1350 additions and 0 deletions
--- a/mayan/apps/document_parsing/init.py
+++ b/mayan/apps/document_parsing/init.py
@@ -0,0 +1,3 @@
+from __future__ import unicode_literals
+
+# Dotted path to the application configuration class of this app (the
+# DocumentParsingApp class declared in apps.py).
+default_app_config = 'document_parsing.apps.DocumentParsingApp'
--- a/mayan/apps/document_parsing/admin.py
+++ b/mayan/apps/document_parsing/admin.py
@@ -0,0 +1,23 @@
+from __future__ import unicode_literals
+
+from django.contrib import admin
+
+# Only the models actually declared in models.py are imported; the previous
+# DocumentTypeSettings and DocumentVersionOCRError names do not exist in this
+# app and made the module fail on import.
+from .models import DocumentPageContent, DocumentVersionParseError
+
+
+@admin.register(DocumentPageContent)
+class DocumentPageContentAdmin(admin.ModelAdmin):
+    """Admin list of the parsed text content stored for each document page."""
+    list_display = ('document_page',)
+
+
+@admin.register(DocumentVersionParseError)
+class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
+    """Admin view of the parse errors; every field is read only."""
+    list_display = ('document_version', 'datetime_submitted')
+    readonly_fields = ('document_version', 'datetime_submitted', 'result')
--- a/mayan/apps/document_parsing/api_views.py
+++ b/mayan/apps/document_parsing/api_views.py
@@ -0,0 +1,97 @@
+from __future__ import absolute_import, unicode_literals
+
+from rest_framework import generics, status
+from rest_framework.response import Response
+
+from documents.models import Document, DocumentPage, DocumentVersion
+from rest_api.permissions import MayanPermission
+
+from .models import DocumentPageContent
+from .permissions import permission_ocr_content_view, permission_ocr_document
+from .serializers import DocumentPageContentSerializer
+
+
+# API view that, on POST, calls submit_for_ocr() on the Document selected by
+# the URL and replies with 202 Accepted and no response body.
+# NOTE(review): permission_ocr_document and Document.submit_for_ocr() are not
+# defined by any module of this app visible in this commit -- presumably
+# carried over from the OCR app; confirm they exist before relying on this.
+class APIDocumentOCRView(generics.GenericAPIView):
+    mayan_object_permissions = {
+        'POST': (permission_ocr_document,)
+    }
+    permission_classes = (MayanPermission,)
+    queryset = Document.objects.all()
+
+    def get_serializer_class(self):
+        # The view neither reads a payload nor returns one, so it has no
+        # serializer.
+        return None
+
+    def post(self, request, *args, **kwargs):
+        """
+        Submit a document for OCR.
+        ---
+        omit_serializer: true
+        parameters:
+            - name: pk
+              paramType: path
+              type: number
+        responseMessages:
+            - code: 202
+              message: Accepted
+        """
+
+        self.get_object().submit_for_ocr()
+        return Response(status=status.HTTP_202_ACCEPTED)
+
+
+# API view that, on POST, calls submit_for_ocr() on the DocumentVersion
+# selected by the URL and replies with 202 Accepted and no response body.
+# NOTE(review): permission_ocr_document and DocumentVersion.submit_for_ocr()
+# are not defined by any module of this app visible in this commit --
+# presumably carried over from the OCR app; confirm before relying on this.
+class APIDocumentVersionOCRView(generics.GenericAPIView):
+    mayan_object_permissions = {
+        'POST': (permission_ocr_document,)
+    }
+    permission_classes = (MayanPermission,)
+    queryset = DocumentVersion.objects.all()
+
+    def get_serializer_class(self):
+        # The view neither reads a payload nor returns one, so it has no
+        # serializer.
+        return None
+
+    def post(self, request, *args, **kwargs):
+        """
+        Submit a document version for OCR.
+        ---
+        omit_serializer: true
+        parameters:
+            - name: pk
+              paramType: path
+              type: number
+        responseMessages:
+            - code: 202
+              message: Accepted
+        """
+
+        self.get_object().submit_for_ocr()
+        return Response(status=status.HTTP_202_ACCEPTED)
+
+
+class APIDocumentPageContentView(generics.RetrieveAPIView):
+    """
+    Returns the parsed content of the selected document page.
+    ---
+    GET:
+        parameters:
+            - name: pk
+              paramType: path
+              type: number
+    """
+
+    # NOTE(review): permission_ocr_content_view is imported from
+    # .permissions, which only declares permission_content_view in this
+    # commit; confirm which permission is meant to guard this view.
+    mayan_object_permissions = {
+        'GET': (permission_ocr_content_view,),
+    }
+    permission_classes = (MayanPermission,)
+    serializer_class = DocumentPageContentSerializer
+    queryset = DocumentPage.objects.all()
+
+    def retrieve(self, request, *args, **kwargs):
+        """
+        Serialize the content attached to the requested DocumentPage. A page
+        without stored content produces a response with an empty
+        ``content`` value instead of an error.
+        """
+        instance = self.get_object()
+
+        try:
+            # ``content`` is the related_name of the one to one relation
+            # declared by DocumentPageContent.document_page.
+            page_content = instance.content
+        except DocumentPageContent.DoesNotExist:
+            # The serializer works on a single object, so hand it an unsaved
+            # blank instance rather than an empty queryset.
+            page_content = DocumentPageContent()
+
+        serializer = self.get_serializer(page_content)
+        return Response(serializer.data)
--- a/mayan/apps/document_parsing/apps.py
+++ b/mayan/apps/document_parsing/apps.py
@@ -0,0 +1,125 @@
+from __future__ import unicode_literals
+
+import logging
+
+from kombu import Exchange, Queue
+
+from django.apps import apps
+from django.db.models.signals import post_save
+from django.utils.translation import ugettext_lazy as _
+
+from acls import ModelPermission
+from common import (
+    MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
+    menu_tools
+)
+from common.settings import settings_db_sync_task_delay
+from documents.search import document_search, document_page_search
+from documents.signals import post_version_upload
+from documents.widgets import document_link
+from mayan.celery import app
+from navigation import SourceColumn
+from rest_api.classes import APIEndPoint
+
+from .handlers import handler_parse_document_version
+from .links import (
+    link_document_content, link_entry_list, link_document_content_errors_list,
+    link_document_content_download
+)
+from .permissions import permission_content_view
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentParsingApp(MayanAppConfig):
+    """
+    Application configuration of the document parsing app.
+
+    ``ready()`` registers the API end point, the object permission, the
+    columns of the parse error list, the search fields, the menu links and
+    the signal receiver that parses newly uploaded document versions.
+    """
+    has_tests = True
+    name = 'document_parsing'
+    verbose_name = _('Document parsing')
+
+    def ready(self):
+        super(DocumentParsingApp, self).ready()
+
+        APIEndPoint(app=self, version_string='1')
+
+        # Models are fetched from the app registry instead of imported at
+        # module level.
+        Document = apps.get_model(
+            app_label='documents', model_name='Document'
+        )
+
+        DocumentVersion = apps.get_model(
+            app_label='documents', model_name='DocumentVersion'
+        )
+
+        DocumentVersionParseError = self.get_model('DocumentVersionParseError')
+
+        ModelPermission.register(
+            model=Document, permissions=(permission_content_view,)
+        )
+
+        SourceColumn(
+            source=DocumentVersionParseError, label=_('Document'),
+            func=lambda context: document_link(context['object'].document_version.document)
+        )
+        SourceColumn(
+            source=DocumentVersionParseError, label=_('Added'),
+            attribute='datetime_submitted'
+        )
+        SourceColumn(
+            source=DocumentVersionParseError, label=_('Result'),
+            attribute='result'
+        )
+
+        # Both lookups follow the ``content`` related_name of
+        # DocumentPageContent.document_page.
+        document_search.add_model_field(
+            field='versions__pages__content__content', label=_('Content')
+        )
+
+        document_page_search.add_model_field(
+            field='content__content', label=_('Content')
+        )
+
+        # Only links that links.py really defines are bound. The previous
+        # code referenced submit/type settings/OCR links that do not exist
+        # in this app and raised NameError as soon as ready() executed.
+        menu_facet.bind_links(
+            links=(link_document_content,), sources=(Document,)
+        )
+        menu_secondary.bind_links(
+            links=(
+                link_document_content, link_document_content_errors_list,
+                link_document_content_download
+            ),
+            # View names match the ``view`` arguments used in links.py
+            sources=(
+                'document_parsing:document_content',
+                'document_parsing:document_page_parsing_error_list',
+                'document_parsing:document_content_download',
+            )
+        )
+        menu_secondary.bind_links(
+            links=(link_entry_list,),
+            sources=(
+                'document_parsing:entry_list',
+                'document_parsing:entry_delete_multiple',
+                'document_parsing:entry_re_queue_multiple',
+                DocumentVersionParseError
+            )
+        )
+        # The trailing comma matters: without it ``links`` is the Link
+        # object itself and not a tuple of links.
+        menu_tools.bind_links(
+            links=(link_entry_list,)
+        )
+
+        post_version_upload.connect(
+            dispatch_uid='document_parsing_handler_parse_document_version',
+            receiver=handler_parse_document_version,
+            sender=DocumentVersion
+        )
--- a/mayan/apps/document_parsing/exceptions.py
+++ b/mayan/apps/document_parsing/exceptions.py
@@ -0,0 +1,22 @@
+from __future__ import unicode_literals
+
+
+class OCRError(Exception):
+    """Error signalled by the OCR backend."""
+
+
+class ParserError(Exception):
+    """Root of the exception hierarchy used by the file parsers."""
+
+
+class NoMIMETypeMatch(ParserError):
+    """No parser is registered for the requested MIME type."""
--- a/mayan/apps/document_parsing/forms.py
+++ b/mayan/apps/document_parsing/forms.py
@@ -0,0 +1,104 @@
+from __future__ import unicode_literals
+
+from django import forms
+from django.utils.encoding import force_text
+from django.utils.html import conditional_escape
+from django.utils.safestring import mark_safe
+from django.utils.translation import ugettext_lazy as _, ugettext
+
+from common.widgets import TextAreaDiv
+from documents.models import DocumentType
+
+# DocumentPageOCRContent does not exist in this app's models; importing it
+# made the whole module fail on import.
+from .models import DocumentPageContent
+
+
+class DocumentContentForm(forms.Form):
+    """
+    Form that concatenates all of a document pages' text content into a
+    single textarea widget.
+
+    The document is passed as the ``instance`` keyword argument. Pages
+    without stored content are skipped; the content of every other page is
+    HTML escaped and followed by a divider carrying its page number.
+    """
+    contents = forms.CharField(
+        label=_('Contents'),
+        widget=TextAreaDiv(
+            attrs={
+                'class': 'text_area_div full-height',
+                'data-height-difference': 360
+            }
+        )
+    )
+
+    def __init__(self, *args, **kwargs):
+        # ``instance`` is not a forms.Form argument, remove it before
+        # calling the parent initializer.
+        self.document = kwargs.pop('instance', None)
+        super(DocumentContentForm, self).__init__(*args, **kwargs)
+
+        try:
+            document_pages = self.document.pages.all()
+        except AttributeError:
+            # No document was supplied
+            document_pages = []
+
+        content = []
+        for page in document_pages:
+            try:
+                # ``content`` is the related_name declared by
+                # DocumentPageContent.document_page.
+                page_content = page.content.content
+            except DocumentPageContent.DoesNotExist:
+                continue
+
+            # Escape the extracted text; only the divider markup added
+            # below is trusted HTML.
+            content.append(conditional_escape(force_text(page_content)))
+            content.append(
+                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
+                    ugettext(
+                        'Page %(page_number)d'
+                    ) % {'page_number': page.page_number}
+                )
+            )
+
+        self.fields['contents'].initial = mark_safe(''.join(content))
+
+
+class DocumentOCRContentForm(DocumentContentForm):
+    """
+    Name kept so that existing imports keep working. The previous body was
+    a copy of DocumentContentForm that called super() with the wrong class
+    (a TypeError on instantiation) and caught the DoesNotExist of a model
+    this app does not define, so it now simply reuses DocumentContentForm.
+    """
+
+
+class DocumentTypeSelectForm(forms.Form):
+    """Form with a single field to choose one of the existing document types."""
+    # The label was wrapped in bare parentheses instead of ``_()`` and was
+    # therefore never translated.
+    document_type = forms.ModelChoiceField(
+        queryset=DocumentType.objects.all(), label=_('Document type')
+    )
--- a/mayan/apps/document_parsing/handlers.py
+++ b/mayan/apps/document_parsing/handlers.py
@@ -0,0 +1,15 @@
+from __future__ import unicode_literals
+
+import logging
+
+from django.apps import apps
+
+logger = logging.getLogger(__name__)
+
+
+def handler_parse_document_version(sender, instance, **kwargs):
+    """
+    Signal receiver that runs the registered parsers on a document version.
+
+    sender: class that sent the signal (unused).
+    instance: the document version to parse.
+    kwargs: remaining signal arguments; parsing is skipped only when
+    ``created`` is present and false.
+    """
+    # Imported here and not at module level: apps.py imports this module
+    # while it is itself being imported and .parsers imports .models, which
+    # must not be loaded that early. The removed ``setting_auto_ocr`` import
+    # referenced a setting this app does not define.
+    from .exceptions import NoMIMETypeMatch
+    from .parsers import Parser
+
+    # NOTE(review): it is not visible here whether the connected signal
+    # sends ``created``; a missing key is treated as a new version instead
+    # of raising KeyError -- confirm against documents.signals.
+    if not kwargs.get('created', True):
+        return
+
+    try:
+        Parser.parse_document_version(document_version=instance)
+    except NoMIMETypeMatch:
+        # No parser could handle this MIME type; that must not abort the
+        # code that sent the signal.
+        logger.debug(
+            'No parser available for document version: %s', instance
+        )
--- a/mayan/apps/document_parsing/links.py
+++ b/mayan/apps/document_parsing/links.py
@@ -0,0 +1,27 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from navigation import Link
+
+from .permissions import permission_content_view
+
+# Navigation links of the document parsing app.
+# NOTE(review): permission_content_view is the only permission this app
+# declares and imports. The previous permission_ocr_document and
+# permission_ocr_content_view names were undefined and raised NameError on
+# import; confirm whether the error list needs a dedicated permission.
+link_document_content = Link(
+    args='resolved_object.id', icon='fa fa-font',
+    permissions=(permission_content_view,), text=_('Content'),
+    view='document_parsing:document_content',
+)
+link_entry_list = Link(
+    icon='fa fa-file-text-o', permissions=(permission_content_view,),
+    text=_('Parsing errors'), view='document_parsing:entry_list'
+)
+link_document_content_errors_list = Link(
+    args='resolved_object.id', icon='fa fa-file-text-o',
+    permissions=(permission_content_view,), text=_('Parsing errors'),
+    view='document_parsing:document_page_parsing_error_list'
+)
+link_document_content_download = Link(
+    args='resolved_object.id', icon='fa fa-file-text-o',
+    permissions=(permission_content_view,), text=_('Download content'),
+    view='document_parsing:document_content_download'
+)
--- a/mayan/apps/document_parsing/managers.py
+++ b/mayan/apps/document_parsing/managers.py
@@ -0,0 +1,14 @@
+from __future__ import unicode_literals
+
+from datetime import timedelta
+import logging
+
+from django.apps import apps
+from django.db import models
+from django.utils.timezone import now
+
+logger = logging.getLogger(__name__)
+
+
+class DocumentPageContentManager(models.Manager):
+    """Manager of DocumentPageContent; adds nothing to models.Manager yet."""
--- a/mayan/apps/document_parsing/models.py
+++ b/mayan/apps/document_parsing/models.py
@@ -0,0 +1,47 @@
+from __future__ import unicode_literals
+
+from django.db import models
+from django.utils.encoding import force_text, python_2_unicode_compatible
+from django.utils.translation import ugettext_lazy as _
+
+from documents.models import DocumentPage, DocumentType, DocumentVersion
+
+from .managers import DocumentPageContentManager
+
+
+@python_2_unicode_compatible
+class DocumentPageContent(models.Model):
+    """
+    Text content stored for a single document page.
+    """
+    # One content row per page; reachable from the page as ``page.content``
+    # and deleted together with the page.
+    document_page = models.OneToOneField(
+        DocumentPage, on_delete=models.CASCADE, related_name='content',
+        verbose_name=_('Document page')
+    )
+    # May be blank
+    content = models.TextField(blank=True, verbose_name=_('Content'))
+
+    objects = DocumentPageContentManager()
+
+    def __str__(self):
+        # Displayed using the text representation of the related page
+        return force_text(self.document_page)
+
+    class Meta:
+        verbose_name = _('Document page content')
+        verbose_name_plural = _('Document pages contents')
+
+
+@python_2_unicode_compatible
+class DocumentVersionParseError(models.Model):
+    """
+    Record of an error related to the parsing of a document version.
+    Entries are ordered by submission date, oldest first.
+    """
+    document_version = models.ForeignKey(
+        DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
+        verbose_name=_('Document version')
+    )
+    # ``auto_now_add`` is the Django option that stamps the row on creation;
+    # the previous ``auto_add_now`` spelling is not a valid field argument
+    # and raised TypeError when the model class was built.
+    datetime_submitted = models.DateTimeField(
+        auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
+    )
+    result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
+
+    def __str__(self):
+        # Displayed using the text representation of the document version
+        return force_text(self.document_version)
+
+    class Meta:
+        ordering = ('datetime_submitted',)
+        verbose_name = _('Document version parse error')
+        verbose_name_plural = _('Document version parse errors')
--- a/mayan/apps/document_parsing/parsers.py
+++ b/mayan/apps/document_parsing/parsers.py
@@ -0,0 +1,202 @@
+from __future__ import unicode_literals
+
+from io import BytesIO
+import logging
+import os
+from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
+from pdfminer.pdfpage import PDFPage
+from pdfminer.converter import TextConverter
+from pdfminer.layout import LAParams
+import subprocess
+
+from django.utils.translation import ugettext_lazy as _
+
+from common.utils import copyfile, fs_cleanup, mkstemp
+
+from .exceptions import ParserError, NoMIMETypeMatch
+from .models import DocumentPageContent
+from .settings import setting_pdftotext_path
+
+logger = logging.getLogger(__name__)
+
+
+class Parser(object):
+    """
+    Parser base class
+
+    Keeps a class level registry that maps a MIME type to the ordered list
+    of parser classes able to handle it. Subclasses implement execute().
+    """
+
+    _registry = {}
+
+    @classmethod
+    def register(cls, mimetypes, parser_classes):
+        """
+        Append every class in parser_classes, in order, to the parser list
+        of each MIME type in mimetypes.
+        """
+        for mimetype in mimetypes:
+            for parser_class in parser_classes:
+                cls._registry.setdefault(
+                    mimetype, []
+                ).append(parser_class)
+
+    @classmethod
+    def _run_parsers(cls, mimetype, method_name, target):
+        """
+        Call the method named method_name with target on an instance of
+        each parser registered for mimetype, in registration order, and
+        stop at the first one that does not raise ParserError.
+
+        Raises NoMIMETypeMatch when the MIME type has no registered parser
+        or when every registered parser raised ParserError.
+        """
+        # Only the registry lookup is guarded so that a KeyError raised by
+        # a parser is not misreported as a missing MIME type.
+        try:
+            parser_classes = cls._registry[mimetype]
+        except KeyError:
+            raise NoMIMETypeMatch
+
+        for parser_class in parser_classes:
+            try:
+                parser = parser_class()
+                getattr(parser, method_name)(target)
+            except ParserError:
+                # If parser raises error, try next parser in the list
+                continue
+            else:
+                # If parser was successful there is no need to try
+                # others in the list for this mimetype
+                return
+
+        raise NoMIMETypeMatch('Parser MIME type list exhausted')
+
+    @classmethod
+    def parse_document_version(cls, document_version):
+        """
+        Parse all the pages of document_version with the first registered
+        parser that succeeds. Raises NoMIMETypeMatch if none does.
+        """
+        cls._run_parsers(
+            document_version.mimetype, 'process_document_version',
+            document_version
+        )
+
+    @classmethod
+    def parse_document_page(cls, document_page):
+        """
+        Parse a single page with the first registered parser that succeeds.
+        The MIME type is taken from the page's document version. Raises
+        NoMIMETypeMatch if no parser succeeds.
+        """
+        cls._run_parsers(
+            document_page.document_version.mimetype, 'process_document_page',
+            document_page
+        )
+
+    def process_document_version(self, document_version):
+        """
+        Run process_document_page() on every page of document_version.
+        """
+        logger.info(
+            'Starting parsing for document version: %s', document_version
+        )
+        logger.debug('document version: %d', document_version.pk)
+
+        for document_page in document_version.pages.all():
+            self.process_document_page(document_page=document_page)
+
+    def process_document_page(self, document_page):
+        """
+        Extract the text of document_page with execute() and store it in
+        the DocumentPageContent of the page, creating it when missing.
+
+        Any exception raised while extracting or saving is logged and
+        raised again as ParserError; the file object is always closed.
+        """
+        logger.info(
+            'Processing page: %d of document version: %s',
+            document_page.page_number, document_page.document_version
+        )
+
+        file_object = document_page.document_version.get_intermidiate_file()
+
+        try:
+            document_page_content, created = DocumentPageContent.objects.get_or_create(
+                document_page=document_page
+            )
+            document_page_content.content = self.execute(
+                file_object=file_object, page_number=document_page.page_number
+            )
+            document_page_content.save()
+        except Exception as exception:
+            # Normalize every failure to ParserError, the only exception
+            # the dispatching class methods treat as "try the next parser".
+            error_message = _('Exception parsing page; %s') % exception
+            logger.error(error_message)
+            raise ParserError(error_message)
+        finally:
+            file_object.close()
+
+        logger.info(
+            'Finished processing page: %d of document version: %s',
+            document_page.page_number, document_page.document_version
+        )
+
+    def execute(self, file_object, page_number):
+        """
+        Return the text of page page_number of file_object. Must be
+        implemented by subclasses.
+        """
+        raise NotImplementedError(
+            'Your %s class has not defined the required execute() method.' %
+            self.__class__.__name__
+        )
+
+
+class PopplerParser(Parser):
+    """
+    PDF parser using the pdftotext executable from the poppler package
+    """
+
+    def __init__(self):
+        """
+        Read the configured pdftotext path. Raises ParserError when no file
+        exists at that path.
+        """
+        self.pdftotext_path = setting_pdftotext_path.value
+        if not os.path.exists(self.pdftotext_path):
+            error_message = _(
+                'Cannot find pdftotext executable at: %s'
+            ) % self.pdftotext_path
+            logger.error(error_message)
+            raise ParserError(error_message)
+
+        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)
+
+    def execute(self, file_object, page_number):
+        """
+        Copy file_object to a temporary file, run pdftotext on the single
+        page page_number and return what it wrote to standard output, minus
+        the trailing form feed markers.
+
+        Raises ParserError when pdftotext exits with a non zero code.
+        """
+        logger.debug('Parsing PDF page: %d', page_number)
+
+        destination_descriptor, temp_filepath = mkstemp()
+        # The temporary file is removed on every exit path, including a
+        # failure while copying or while launching the process.
+        try:
+            copyfile(file_object, temp_filepath)
+
+            # -f and -l are both set to the same page; '-' sends the
+            # result to standard output.
+            command = [
+                self.pdftotext_path, '-f', str(page_number), '-l',
+                str(page_number), temp_filepath, '-'
+            ]
+
+            proc = subprocess.Popen(
+                command, close_fds=True, stderr=subprocess.PIPE,
+                stdout=subprocess.PIPE
+            )
+            # communicate() drains both pipes while waiting. Calling wait()
+            # first can deadlock once the output fills the pipe buffer.
+            output, error_output = proc.communicate()
+        finally:
+            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)
+
+        if proc.returncode != 0:
+            logger.error(error_output)
+            raise ParserError(
+                'pdftotext exit code: %d' % proc.returncode
+            )
+
+        # A lone form feed means the page has no text
+        if output == b'\x0c':
+            logger.debug('Parser didn\'t return any output')
+            return ''
+
+        # Strip the two line feeds and the form feed that end the output
+        if output[-3:] == b'\x0a\x0a\x0c':
+            return output[:-3]
+
+        return output
+
+
+class PDFMinerParser(Parser):
+    """
+    Parser for PDF files using the PDFMiner library for Python
+    """
+
+    def execute(self, file_object, page_number):
+        """
+        Return the text PDFMiner extracts from page page_number (counted
+        from 1) of file_object, as the raw value of the output buffer.
+        """
+        logger.debug('Parsing PDF page: %d', page_number)
+
+        with BytesIO() as string_buffer:
+            rsrcmgr = PDFResourceManager()
+            device = TextConverter(
+                rsrcmgr, outfp=string_buffer, laparams=LAParams()
+            )
+            try:
+                interpreter = PDFPageInterpreter(rsrcmgr, device)
+                # page_number counts from 1, pagenos counts from 0
+                pages = PDFPage.get_pages(
+                    file_object, maxpages=1, pagenos=(page_number - 1,)
+                )
+                # next() works on Python 2 and 3; the generator ``.next()``
+                # method only exists on Python 2.
+                interpreter.process_page(next(pages))
+            finally:
+                # Close the converter even when the page fails to process
+                device.close()
+
+            logger.debug('Finished parsing PDF: %d', page_number)
+
+            return string_buffer.getvalue()
+
+
+# Register the PDF parsers. Order matters: parsers are tried in registration
+# order, so PDFMinerParser is only used when PopplerParser raises
+# ParserError (for instance when the pdftotext executable is missing).
+Parser.register(
+    mimetypes=('application/pdf',),
+    parser_classes=(PopplerParser, PDFMinerParser)
+)
--- a/mayan/apps/document_parsing/permissions.py
+++ b/mayan/apps/document_parsing/permissions.py
@@ -0,0 +1,11 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from permissions import PermissionNamespace
+
+# Permission namespace of the document parsing app
+namespace = PermissionNamespace('document_parsing', _('Document parsing'))
+
+# The single permission declared by this app
+permission_content_view = namespace.add_permission(
+    name='content_view', label=_('View the content of a document')
+)
--- a/mayan/apps/document_parsing/queues.py
+++ b/mayan/apps/document_parsing/queues.py
@@ -0,0 +1,10 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from task_manager.classes import CeleryQueue
+
+# Declares a Celery queue named 'ocr' with one task type.
+# NOTE(review): the queue name, its label and the 'ocr.tasks.task_do_ocr'
+# task all refer to OCR and not to document parsing; no tasks module of this
+# app is visible in this commit. Presumably carried over from the OCR app --
+# confirm it does not duplicate the queue that app declares.
+queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
+queue_ocr.add_task_type(
+    name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
+)
--- a/mayan/apps/document_parsing/serializers.py
+++ b/mayan/apps/document_parsing/serializers.py
@@ -0,0 +1,11 @@
+from __future__ import unicode_literals
+
+from rest_framework import serializers
+
+from .models import DocumentPageContent
+
+
+class DocumentPageContentSerializer(serializers.ModelSerializer):
+    # Exposes only the ``content`` field of DocumentPageContent
+    class Meta:
+        fields = ('content',)
+        model = DocumentPageContent
--- a/mayan/apps/document_parsing/settings.py
+++ b/mayan/apps/document_parsing/settings.py
@@ -0,0 +1,17 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from smart_settings import Namespace
+
+# Settings namespace of the document parsing app
+namespace = Namespace(name='document_parsing', label=_('Document parsing'))
+
+# Location of the pdftotext executable; read by PopplerParser in parsers.py
+setting_pdftotext_path = namespace.add_setting(
+    global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH',
+    default='/usr/bin/pdftotext',
+    help_text=_(
+        'File path to poppler\'s pdftotext program used to extract text '
+        'from PDF files.'
+    ),
+    is_path=True
+)
--- a/mayan/apps/document_parsing/tests/init.py
+++ b/mayan/apps/document_parsing/tests/init.py
--- a/mayan/apps/document_parsing/tests/test_api.py
+++ b/mayan/apps/document_parsing/tests/test_api.py
@@ -0,0 +1,88 @@
+from __future__ import unicode_literals
+
+import json
+
+from django.contrib.auth import get_user_model
+from django.urls import reverse
+
+from rest_framework import status
+
+from documents.models import DocumentType
+from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
+from rest_api.tests import BaseAPITestCase
+from user_management.tests import (
+    TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
+)
+
+
+class OCRAPITestCase(BaseAPITestCase):
+    """
+    Test the API endpoints that submit documents for processing and return
+    the content of a document page
+    """
+    # NOTE(review): the class name and the 'rest_api:document-ocr-...' URL
+    # names still refer to OCR; no urls module of this app is visible in
+    # this commit -- confirm the names the app really registers.
+
+    def setUp(self):
+        super(OCRAPITestCase, self).setUp()
+
+        self.admin_user = get_user_model().objects.create_superuser(
+            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
+            password=TEST_ADMIN_PASSWORD
+        )
+
+        self.client.login(
+            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
+        )
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # The test document is a binary file, open it in binary mode
+        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=file_object,
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(OCRAPITestCase, self).tearDown()
+
+    def test_submit_document(self):
+        response = self.client.post(
+            reverse(
+                'rest_api:document-ocr-submit-view',
+                args=(self.document.pk,)
+            )
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+        # ``content`` is the related_name declared by this app's
+        # DocumentPageContent model
+        content = self.document.pages.first().content.content
+
+        self.assertIn('Mayan EDMS Documentation', content)
+
+    def test_submit_document_version(self):
+        response = self.client.post(
+            reverse(
+                'rest_api:document-version-ocr-submit-view',
+                args=(self.document.latest_version.pk,)
+            )
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+        content = self.document.pages.first().content.content
+
+        self.assertIn('Mayan EDMS Documentation', content)
+
+    def test_get_document_version_page_content(self):
+        response = self.client.get(
+            reverse(
+                'rest_api:document-page-content-view',
+                args=(self.document.latest_version.pages.first().pk,)
+            ),
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
+        # response.content is a byte string, decode it before loading it
+        self.assertIn(
+            'Mayan EDMS Documentation',
+            json.loads(response.content.decode('utf-8'))['content']
+        )
--- a/mayan/apps/document_parsing/tests/test_events.py
+++ b/mayan/apps/document_parsing/tests/test_events.py
@@ -0,0 +1,41 @@
+from __future__ import unicode_literals
+
+from actstream.models import Action
+
+from documents.tests.test_models import GenericDocumentTestCase
+
+from ..events import (
+    event_ocr_document_version_submit, event_ocr_document_version_finish
+)
+
+
+class OCREventsTestCase(GenericDocumentTestCase):
+    """
+    Check the events recorded when a document is submitted for processing
+    """
+    # NOTE(review): submit_for_ocr() and the ..events module are not defined
+    # by this app in this commit -- presumably carried over from the OCR
+    # app; confirm before enabling these tests.
+
+    def test_document_version_submit_event(self):
+        Action.objects.all().delete()
+        self.document.submit_for_ocr()
+
+        self.assertEqual(
+            Action.objects.first().target, self.document.latest_version
+        )
+        self.assertEqual(
+            Action.objects.first().verb,
+            event_ocr_document_version_submit.name
+        )
+
+    def test_document_version_finish_event(self):
+        # The leftover debugging code (Python 2 only print statements and
+        # an import of a model that does not exist) has been removed.
+        Action.objects.all().delete()
+        self.document.submit_for_ocr()
+
+        self.assertEqual(
+            Action.objects.last().target, self.document.latest_version
+        )
+        self.assertEqual(
+            Action.objects.last().verb,
+            event_ocr_document_version_finish.name
+        )
--- a/mayan/apps/document_parsing/tests/test_models.py
+++ b/mayan/apps/document_parsing/tests/test_models.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.settings import setting_language_choices
+from documents.tests import (
+    TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
+)
+
+
+class DocumentOCRTestCase(BaseTestCase):
+    """
+    Check that the first page of an uploaded document has the expected
+    content
+    """
+    # PyOCR's leak descriptor in get_available_languages and image_to_string
+    # Disable descriptor leak test until fixed in upstream
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(DocumentOCRTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # The test document is a binary file, open it in binary mode
+        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=file_object,
+            )
+
+    def tearDown(self):
+        self.document.delete()
+        self.document_type.delete()
+        super(DocumentOCRTestCase, self).tearDown()
+
+    def test_ocr_language_backends_end(self):
+        # ``content`` is the related_name declared by this app's
+        # DocumentPageContent model
+        content = self.document.pages.first().content.content
+        self.assertIn('Mayan EDMS Documentation', content)
+
+
+class GermanOCRSupportTestCase(BaseTestCase):
+    """
+    Check the content of a German language document uploaded with its
+    language code set
+    """
+    # PyOCR's leak descriptor in get_available_languages and image_to_string
+    # Disable descriptor leak test until fixed in upstream
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(GermanOCRSupportTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Get corresponding language code for German from the default language
+        # choices list
+        language_code = next(
+            language[0] for language in setting_language_choices.value
+            if language[1] == 'German'
+        )
+
+        self.assertEqual('deu', language_code)
+
+        # The test document is a binary file, open it in binary mode
+        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=file_object, language=language_code
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(GermanOCRSupportTestCase, self).tearDown()
+
+    def test_ocr_language_backends_end(self):
+        # ``content`` is the related_name declared by this app's
+        # DocumentPageContent model
+        content = self.document.pages.first().content.content
+
+        self.assertIn('Repository für elektronische Dokumente.', content)
+        self.assertIn('Es bietet einen', content)
--- a/mayan/apps/document_parsing/tests/test_parsers.py
+++ b/mayan/apps/document_parsing/tests/test_parsers.py
@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+from django.core.files.base import File
+from django.test import override_settings
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.tests import (
+    TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
+)
+
+from ..classes import TextExtractor
+from ..parsers import PDFMinerParser, PopplerParser
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class ParserTestCase(BaseTestCase):
+    # Runs each parser explicitly against the latest version of the test
+    # document and checks the content stored for its first page. The
+    # OCR_AUTO_OCR setting is overridden to False for these tests.
+
+    def setUp(self):
+        super(ParserTestCase, self).setUp()
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Read the upload as raw bytes: text mode would apply newline
+        # translation (Windows) or decoding (Python 3) to the file content.
+        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(ParserTestCase, self).tearDown()
+
+    def test_pdfminer_parser(self):
+        parser = PDFMinerParser()
+
+        parser.process_document_version(self.document.latest_version)
+
+        # assertIn reports both operands when the expected text is missing
+        self.assertIn(
+            'Mayan EDMS Documentation',
+            self.document.pages.first().ocr_content.content
+        )
+
+    def test_poppler_parser(self):
+        parser = PopplerParser()
+
+        parser.process_document_version(self.document.latest_version)
+
+        # assertIn reports both operands when the expected text is missing
+        self.assertIn(
+            'Mayan EDMS Documentation',
+            self.document.pages.first().ocr_content.content
+        )
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class TextExtractorTestCase(BaseTestCase):
+    # Runs TextExtractor against the latest version of the hybrid test
+    # document and checks the content stored for its first and last pages.
+    # The OCR_AUTO_OCR setting is overridden to False for this test.
+
+    def setUp(self):
+        super(TextExtractorTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Read the upload as raw bytes: text mode would apply newline
+        # translation (Windows) or decoding (Python 3) to the file content.
+        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(TextExtractorTestCase, self).tearDown()
+
+    def test_text_extractor(self):
+        TextExtractor.process_document_version(
+            document_version=self.document.latest_version
+        )
+
+        # Expected content of the first page
+        self.assertEqual(
+            self.document.latest_version.pages.first().ocr_content.content,
+            'Sample text',
+        )
+
+        # Expected content of the last page
+        self.assertEqual(
+            self.document.latest_version.pages.last().ocr_content.content,
+            'Sample text in image form',
+        )
--- a/mayan/apps/document_parsing/tests/test_views.py
+++ b/mayan/apps/document_parsing/tests/test_views.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from django.test import override_settings
+
+from documents.tests.test_views import GenericDocumentViewTestCase
+
+from ..permissions import permission_ocr_content_view
+from ..utils import get_document_ocr_content
+
+
+@override_settings(OCR_AUTO_OCR=True)
+class OCRViewsTestCase(GenericDocumentViewTestCase):
+    # Permission and content checks for the 'ocr:document_content' and
+    # 'ocr:document_ocr_download' views.
+
+    # PyOCR leaks file descriptors in get_available_languages and
+    # image_to_string. Disable the descriptor leak test until this is fixed
+    # upstream.
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(OCRViewsTestCase, self).setUp()
+        self.login_user()
+
+    def _document_content_view(self):
+        # GET request to the OCR content view of the test document
+        view_arguments = (self.document.pk,)
+        return self.get('ocr:document_content', args=view_arguments)
+
+    def _document_ocr_download_view(self):
+        # GET request to the OCR download view of the test document
+        view_arguments = (self.document.pk,)
+        return self.get('ocr:document_ocr_download', args=view_arguments)
+
+    def test_document_content_view_no_permissions(self):
+        response = self._document_content_view()
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_content_view_with_permission(self):
+        self.grant_permission(permission=permission_ocr_content_view)
+
+        response = self._document_content_view()
+
+        self.assertContains(
+            response, 'Mayan EDMS Documentation', status_code=200
+        )
+
+    def test_document_ocr_download_view_no_permission(self):
+        response = self._document_ocr_download_view()
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_download_view_with_permission(self):
+        self.expected_content_type = 'application/octet-stream; charset=utf-8'
+
+        self.grant_permission(permission=permission_ocr_content_view)
+        response = self._document_ocr_download_view()
+
+        self.assertEqual(response.status_code, 200)
+
+        # The download must match the concatenated OCR content of the document
+        expected_content = ''.join(
+            get_document_ocr_content(document=self.document)
+        )
+        self.assert_download_response(response, content=expected_content)
--- a/mayan/apps/document_parsing/urls.py
+++ b/mayan/apps/document_parsing/urls.py
@@ -0,0 +1,65 @@
+from __future__ import unicode_literals
+
+from django.conf.urls import url
+
+from .api_views import (
+    APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
+)
+from .views import (
+    DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
+    DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
+    DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
+)
+
+# URL patterns of the app's regular (non-API) views.
+urlpatterns = [
+    url(
+        regex=r'^(?P<pk>\d+)/content/$',
+        view=DocumentOCRContent.as_view(),
+        name='document_content'
+    ),
+    url(
+        regex=r'^document/(?P<pk>\d+)/submit/$',
+        view=DocumentSubmitView.as_view(),
+        name='document_submit'
+    ),
+    url(
+        regex=r'^document/all/submit/$',
+        view=DocumentAllSubmitView.as_view(),
+        name='document_submit_all'
+    ),
+    url(
+        regex=r'^document/type/submit/$',
+        view=DocumentTypeSubmitView.as_view(),
+        name='document_type_submit'
+    ),
+    url(
+        regex=r'^document/multiple/submit/$',
+        view=DocumentSubmitManyView.as_view(),
+        name='document_submit_multiple'
+    ),
+    url(
+        regex=r'^document_type/(?P<pk>\d+)/ocr/settings/$',
+        view=DocumentTypeSettingsEditView.as_view(),
+        name='document_type_ocr_settings'
+    ),
+    url(
+        regex=r'^documents/(?P<pk>\d+)/ocr/errors/$',
+        view=DocumentOCRErrorsListView.as_view(),
+        name='document_ocr_error_list'
+    ),
+    url(
+        regex=r'^documents/(?P<pk>\d+)/ocr/download/$',
+        view=DocumentOCRDownloadView.as_view(),
+        name='document_ocr_download'
+    ),
+    url(
+        regex=r'^all/$',
+        view=EntryListView.as_view(),
+        name='entry_list'
+    ),
+]
+
+# URL patterns of the app's REST API views.
+api_urls = [
+    url(
+        regex=r'^document/(?P<pk>\d+)/submit/$',
+        view=APIDocumentOCRView.as_view(),
+        name='document-ocr-submit-view'
+    ),
+    url(
+        regex=r'^document_version/(?P<pk>\d+)/submit/$',
+        view=APIDocumentVersionOCRView.as_view(),
+        name='document-version-ocr-submit-view'
+    ),
+    url(
+        regex=r'^page/(?P<pk>\d+)/content/$',
+        view=APIDocumentPageContentView.as_view(),
+        name='document-page-content-view'
+    ),
+]
--- a/mayan/apps/document_parsing/utils.py
+++ b/mayan/apps/document_parsing/utils.py
@@ -0,0 +1,16 @@
+from __future__ import unicode_literals
+
+from django.utils.encoding import force_text
+from django.utils.html import conditional_escape
+
+from .models import DocumentPageContent
+
+
+def get_document_ocr_content(document):
+    """
+    Generator over the OCR content of the pages of `document`.
+
+    Each page's content is coerced to text and conditionally HTML escaped
+    before being yielded. Pages for which no content entry exists are
+    skipped.
+    """
+    for document_page in document.pages.all():
+        try:
+            ocr_text = document_page.ocr_content.content
+        except DocumentPageContent.DoesNotExist:
+            # Nothing stored for this page, move on to the next one
+            continue
+
+        yield conditional_escape(force_text(ocr_text))
--- a/mayan/apps/document_parsing/views.py
+++ b/mayan/apps/document_parsing/views.py
@@ -0,0 +1,190 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.contrib import messages
+from django.http import HttpResponseRedirect
+from django.shortcuts import get_object_or_404
+from django.urls import reverse
+from django.utils.translation import ugettext_lazy as _
+
+from acls.models import AccessControlList
+from common.generics import (
+    ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
+    SingleObjectEditView, SingleObjectListView
+)
+from common.mixins import MultipleInstanceActionMixin
+from documents.models import Document, DocumentType
+
+from .forms import DocumentContentForm, DocumentTypeSelectForm
+from .models import DocumentVersionOCRError
+from .permissions import (
+    permission_ocr_content_view, permission_ocr_document,
+    permission_document_type_ocr_setup
+)
+from .utils import get_document_ocr_content
+
+
+class DocumentAllSubmitView(ConfirmView):
+    # Submits every document for OCR and reports how many were submitted.
+    extra_context = {'title': _('Submit all documents for OCR?')}
+
+    def get_post_action_redirect(self):
+        return reverse('common:tools_list')
+
+    def view_action(self):
+        # Stays at 0 when there are no documents; otherwise ends up holding
+        # the 1-based position of the last document submitted.
+        submitted = 0
+        for submitted, document in enumerate(Document.objects.all(), 1):
+            document.submit_for_ocr()
+
+        success_text = _('%d documents added to the OCR queue.') % submitted
+        messages.success(self.request, success_text)
+
+
+class DocumentSubmitView(ConfirmView):
+    # Submits the document identified by the `pk` URL keyword argument to the
+    # OCR queue, after checking the requesting user's access to it.
+
+    def get_extra_context(self):
+        # Fetch the document once and reuse it for both context entries
+        # instead of issuing one lookup per entry.
+        document = self.get_object()
+
+        return {
+            'object': document,
+            'title': _('Submit "%s" to the OCR queue?') % document
+        }
+
+    def get_object(self):
+        # Answer an unknown pk with a 404 response instead of letting
+        # Document.DoesNotExist propagate as a server error.
+        return get_object_or_404(Document, pk=self.kwargs['pk'])
+
+    def object_action(self, instance):
+        # The access check comes first so that submit_for_ocr is not reached
+        # if check_access raises.
+        AccessControlList.objects.check_access(
+            permissions=permission_ocr_document, user=self.request.user,
+            obj=instance
+        )
+
+        instance.submit_for_ocr()
+
+    def view_action(self):
+        instance = self.get_object()
+
+        self.object_action(instance=instance)
+
+        messages.success(
+            self.request,
+            _('Document: %(document)s was added to the OCR queue.') % {
+                'document': instance
+            }
+        )
+
+
+class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
+    # Multiple document variant of DocumentSubmitView.
+    model = Document
+    success_message = '%(count)d document submitted to the OCR queue.'
+    success_message_plural = '%(count)d documents submitted to the OCR queue.'
+
+    def get_extra_context(self):
+        # Replaces the parent implementation; only a title is provided here.
+        title = _('Submit the selected documents to the OCR queue?')
+        return {'title': title}
+
+
+class DocumentTypeSubmitView(FormView):
+    # Submits for OCR every document of the document type chosen in the form.
+    extra_context = {
+        'title': _('Submit all documents of a type for OCR')
+    }
+    form_class = DocumentTypeSelectForm
+
+    def get_post_action_redirect(self):
+        return reverse('common:tools_list')
+
+    def form_valid(self, form):
+        document_type = form.cleaned_data['document_type']
+
+        # Stays at 0 when the type has no documents; otherwise ends up
+        # holding the 1-based position of the last document submitted.
+        count = 0
+        for count, document in enumerate(document_type.documents.all(), 1):
+            document.submit_for_ocr()
+
+        message_template = _(
+            '%(count)d documents of type "%(document_type)s" added to the '
+            'OCR queue.'
+        )
+        messages.success(
+            self.request, message_template % {
+                'count': count,
+                'document_type': document_type
+            }
+        )
+
+        return HttpResponseRedirect(self.get_success_url())
+
+
+class DocumentTypeSettingsEditView(SingleObjectEditView):
+    # Edits the `auto_ocr` field of the OCR settings of a document type.
+    view_permission = permission_document_type_ocr_setup
+    fields = ('auto_ocr',)
+
+    def get_object(self, queryset=None):
+        # The edited object is the `ocr_settings` of the document type named
+        # by the `pk` URL keyword argument, not the document type itself.
+        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
+        return document_type.ocr_settings
+
+    def get_extra_context(self):
+        title = _(
+            'Edit OCR settings for document type: %s'
+        ) % self.get_object().document_type
+
+        return {'title': title}
+
+
+class DocumentOCRContent(SingleObjectDetailView):
+    # Detail view for a document that uses DocumentContentForm and requires
+    # the OCR content view permission on the object.
+    form_class = DocumentContentForm
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def dispatch(self, request, *args, **kwargs):
+        result = super(DocumentOCRContent, self).dispatch(
+            request, *args, **kwargs
+        )
+        # Reached only when the parent dispatch returned without raising
+        self.get_object().add_as_recent_document_for_user(request.user)
+        return result
+
+    def get_extra_context(self):
+        # Fetch the document once and reuse it for every context entry
+        # instead of issuing one lookup per entry.
+        document = self.get_object()
+
+        return {
+            'document': document,
+            'hide_labels': True,
+            'object': document,
+            'title': _('OCR result for document: %s') % document,
+        }
+
+
+class EntryListView(SingleObjectListView):
+    # Lists all DocumentVersionOCRError entries.
+    view_permission = permission_ocr_document
+    extra_context = {
+        'title': _('OCR errors'),
+        'hide_object': True,
+    }
+
+    def get_object_list(self):
+        ocr_errors = DocumentVersionOCRError.objects.all()
+        return ocr_errors
+
+
+class DocumentOCRErrorsListView(SingleObjectListView):
+    # Lists the OCR errors of the latest version of a single document.
+    view_permission = permission_ocr_document
+
+    def get_document(self):
+        # 404 when no document matches the `pk` URL keyword argument
+        return get_object_or_404(Document, pk=self.kwargs['pk'])
+
+    def get_extra_context(self):
+        # Fetch the document once and reuse it for both context entries
+        # instead of issuing one lookup per entry.
+        document = self.get_document()
+
+        return {
+            'hide_object': True,
+            'object': document,
+            'title': _('OCR errors for document: %s') % document,
+        }
+
+    def get_object_list(self):
+        return self.get_document().latest_version.ocr_errors.all()
+
+
+class DocumentOCRDownloadView(SingleObjectDownloadView):
+    # Offers the OCR content of a document as a download; requires the OCR
+    # content view permission on the object.
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def get_file(self):
+        # Fetch the document once and reuse it for the content and the name
+        # instead of issuing one lookup for each.
+        document = self.get_object()
+
+        # TextIteratorIO and VirtualFile are looked up on this class;
+        # presumably they are inherited from SingleObjectDownloadView.
+        file_object = DocumentOCRDownloadView.TextIteratorIO(
+            iterator=get_document_ocr_content(document=document)
+        )
+        return DocumentOCRDownloadView.VirtualFile(
+            file=file_object, name='{}-OCR'.format(document)
+        )
--- a/mayan/settings/base.py
+++ b/mayan/settings/base.py
@@ -84,6 +84,7 @@ INSTALLED_APPS = (
    'checkouts',
    'document_comments',
    'document_indexing',
+    'document_parsing',
    'document_signatures',
    'document_states',
    'documents',