Initial commit of the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2017-08-23 02:23:14 -04:00
parent 317d07a355
commit e9591c92f9
25 changed files with 1350 additions and 0 deletions
--- a/mayan/apps/document_parsing/tests/__init__.py
+++ b/mayan/apps/document_parsing/tests/__init__.py
--- a/mayan/apps/document_parsing/tests/test_api.py
+++ b/mayan/apps/document_parsing/tests/test_api.py
@@ -0,0 +1,88 @@
+from __future__ import unicode_literals
+
+import json
+
+from django.contrib.auth import get_user_model
+from django.urls import reverse
+
+from rest_framework import status
+
+from documents.models import DocumentType
+from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
+from rest_api.tests import BaseAPITestCase
+from user_management.tests import (
+    TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
+)
+
+
+class OCRAPITestCase(BaseAPITestCase):
+    """Test the OCR app API endpoints while logged in as a superuser."""
+
+    def setUp(self):
+        super(OCRAPITestCase, self).setUp()
+
+        self.admin_user = get_user_model().objects.create_superuser(
+            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
+            password=TEST_ADMIN_PASSWORD
+        )
+
+        self.client.login(
+            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
+        )
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Read the fixture as bytes: text mode may alter the data or fail
+        # to decode it.
+        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=file_object,
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(OCRAPITestCase, self).tearDown()
+
+    def test_submit_document(self):
+        """POST to the document submit view returns 202; content exists."""
+        response = self.client.post(
+            reverse(
+                'rest_api:document-ocr-submit-view',
+                args=(self.document.pk,)
+            )
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+        content = self.document.pages.first().ocr_content.content
+        self.assertIn('Mayan EDMS Documentation', content)
+
+    def test_submit_document_version(self):
+        """POST to the version submit view returns 202; content exists."""
+        response = self.client.post(
+            reverse(
+                'rest_api:document-version-ocr-submit-view',
+                args=(self.document.latest_version.pk,)
+            )
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
+
+        content = self.document.pages.first().ocr_content.content
+        self.assertIn('Mayan EDMS Documentation', content)
+
+    def test_get_document_version_page_content(self):
+        """GET on the page content view returns the text in a JSON body."""
+        response = self.client.get(
+            reverse(
+                'rest_api:document-page-content-view',
+                args=(self.document.latest_version.pages.first().pk,)
+            ),
+        )
+
+        self.assertEqual(response.status_code, status.HTTP_200_OK)
+
+        content = json.loads(response.content)['content']
+        self.assertIn('Mayan EDMS Documentation', content)
--- a/mayan/apps/document_parsing/tests/test_events.py
+++ b/mayan/apps/document_parsing/tests/test_events.py
@@ -0,0 +1,41 @@
+from __future__ import unicode_literals
+
+from actstream.models import Action
+
+from documents.tests.test_models import GenericDocumentTestCase
+
+from ..events import (
+    event_ocr_document_version_submit, event_ocr_document_version_finish
+)
+
+
+class OCREventsTestCase(GenericDocumentTestCase):
+    """
+    Check the events recorded when a document is submitted for OCR.
+    """
+
+    def test_document_version_submit_event(self):
+        # Clear the log so only actions created afterwards are inspected.
+        Action.objects.all().delete()
+        self.document.submit_for_ocr()
+
+        action = Action.objects.first()
+
+        self.assertEqual(action.target, self.document.latest_version)
+        self.assertEqual(
+            action.verb,
+            event_ocr_document_version_submit.name
+        )
+
+    def test_document_version_finish_event(self):
+        # Clear the log so only actions created afterwards are inspected.
+        Action.objects.all().delete()
+        self.document.submit_for_ocr()
+
+        action = Action.objects.last()
+
+        self.assertEqual(action.target, self.document.latest_version)
+        self.assertEqual(
+            action.verb,
+            event_ocr_document_version_finish.name
+        )
--- a/mayan/apps/document_parsing/tests/test_models.py
+++ b/mayan/apps/document_parsing/tests/test_models.py
@@ -0,0 +1,77 @@
+# -*- coding: utf-8 -*-
+
+from __future__ import unicode_literals
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.settings import setting_language_choices
+from documents.tests import (
+    TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
+)
+
+
+class DocumentOCRTestCase(BaseTestCase):
+    """Check the OCR content of a newly created document."""
+    # PyOCR leaks file descriptors in get_available_languages and
+    # image_to_string. Skip the descriptor leak test until fixed upstream.
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(DocumentOCRTestCase, self).setUp()
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:  # bytes
+            self.document = self.document_type.new_document(
+                file_object=file_object,
+            )
+
+    def tearDown(self):
+        self.document.delete()
+        self.document_type.delete()
+        super(DocumentOCRTestCase, self).tearDown()
+
+    def test_ocr_language_backends_end(self):
+        content = self.document.pages.first().ocr_content.content
+        self.assertIn('Mayan EDMS Documentation', content)
+
+
+class GermanOCRSupportTestCase(BaseTestCase):
+    """Check the OCR content of a document created with language 'deu'."""
+
+    # PyOCR leaks file descriptors in get_available_languages and
+    # image_to_string. Skip the descriptor leak test until fixed upstream.
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(GermanOCRSupportTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Look up the code paired with the 'German' label in the default
+        # language choices list.
+        language_code = next(
+            language[0] for language in setting_language_choices.value
+            if language[1] == 'German'
+        )
+
+        self.assertEqual('deu', language_code)
+
+        # Read the fixture as bytes; text mode may alter or fail to decode it.
+        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=file_object, language=language_code
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(GermanOCRSupportTestCase, self).tearDown()
+
+    def test_ocr_language_backends_end(self):
+        content = self.document.pages.first().ocr_content.content
+
+        self.assertIn('Repository für elektronische Dokumente.', content)
+        self.assertIn('Es bietet einen', content)
--- a/mayan/apps/document_parsing/tests/test_parsers.py
+++ b/mayan/apps/document_parsing/tests/test_parsers.py
@@ -0,0 +1,83 @@
+from __future__ import unicode_literals
+
+from django.core.files.base import File
+from django.test import override_settings
+
+from common.tests import BaseTestCase
+from documents.models import DocumentType
+from documents.tests import (
+    TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
+)
+
+from ..classes import TextExtractor
+from ..parsers import PDFMinerParser, PopplerParser
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class ParserTestCase(BaseTestCase):
+    """Run each parser directly against the latest document version."""
+
+    def setUp(self):
+        super(ParserTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Read the fixture as bytes; text mode may alter or fail to decode it.
+        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(ParserTestCase, self).tearDown()
+
+    def test_pdfminer_parser(self):
+        parser = PDFMinerParser()
+        parser.process_document_version(self.document.latest_version)
+
+        content = self.document.pages.first().ocr_content.content
+        self.assertIn('Mayan EDMS Documentation', content)
+
+    def test_poppler_parser(self):
+        parser = PopplerParser()
+        parser.process_document_version(self.document.latest_version)
+
+        content = self.document.pages.first().ocr_content.content
+        self.assertIn('Mayan EDMS Documentation', content)
+
+
+@override_settings(OCR_AUTO_OCR=False)
+class TextExtractorTestCase(BaseTestCase):
+    """Run TextExtractor on the TEST_HYBRID_DOCUMENT_PATH fixture."""
+
+    def setUp(self):
+        super(TextExtractorTestCase, self).setUp()
+
+        self.document_type = DocumentType.objects.create(
+            label=TEST_DOCUMENT_TYPE_LABEL
+        )
+
+        # Read the fixture as bytes; text mode may alter or fail to decode it.
+        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
+            self.document = self.document_type.new_document(
+                file_object=File(file_object)
+            )
+
+    def tearDown(self):
+        self.document_type.delete()
+        super(TextExtractorTestCase, self).tearDown()
+
+    def test_text_extractor(self):
+        TextExtractor.process_document_version(
+            document_version=self.document.latest_version
+        )
+
+        pages = self.document.latest_version.pages
+
+        self.assertEqual(pages.first().ocr_content.content, 'Sample text')
+        self.assertEqual(
+            pages.last().ocr_content.content, 'Sample text in image form'
+        )
--- a/mayan/apps/document_parsing/tests/test_views.py
+++ b/mayan/apps/document_parsing/tests/test_views.py
@@ -0,0 +1,61 @@
+from __future__ import unicode_literals
+
+from django.test import override_settings
+
+from documents.tests.test_views import GenericDocumentViewTestCase
+
+from ..permissions import permission_ocr_content_view
+from ..utils import get_document_ocr_content
+
+
+@override_settings(OCR_AUTO_OCR=True)
+class OCRViewsTestCase(GenericDocumentViewTestCase):
+    """
+    Permission checks for the OCR content and OCR download views.
+    """
+
+    # PyOCR leaks file descriptors in get_available_languages and
+    # image_to_string. Skip the descriptor leak test until fixed upstream.
+    _skip_file_descriptor_test = True
+
+    def setUp(self):
+        super(OCRViewsTestCase, self).setUp()
+        self.login_user()
+
+    def _document_content_view(self):
+        view_arguments = (self.document.pk,)
+        return self.get('ocr:document_content', args=view_arguments)
+
+    def _document_ocr_download_view(self):
+        view_arguments = (self.document.pk,)
+        return self.get('ocr:document_ocr_download', args=view_arguments)
+
+    def test_document_content_view_no_permissions(self):
+        response = self._document_content_view()
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_content_view_with_permission(self):
+        self.grant_permission(permission=permission_ocr_content_view)
+        response = self._document_content_view()
+
+        self.assertContains(
+            response, 'Mayan EDMS Documentation', status_code=200
+        )
+
+    def test_document_ocr_download_view_no_permission(self):
+        response = self._document_ocr_download_view()
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_download_view_with_permission(self):
+        self.expected_content_type = 'application/octet-stream; charset=utf-8'
+        self.grant_permission(permission=permission_ocr_content_view)
+
+        response = self._document_ocr_download_view()
+        self.assertEqual(response.status_code, 200)
+
+        expected_content = ''.join(
+            get_document_ocr_content(document=self.document)
+        )
+        self.assert_download_response(response, content=expected_content)