Initial commit of the document parsing app.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
0
mayan/apps/document_parsing/tests/__init__.py
Normal file
0
mayan/apps/document_parsing/tests/__init__.py
Normal file
88
mayan/apps/document_parsing/tests/test_api.py
Normal file
88
mayan/apps/document_parsing/tests/test_api.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.urls import reverse
|
||||
|
||||
from rest_framework import status
|
||||
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||
from rest_api.tests import BaseAPITestCase
|
||||
from user_management.tests import (
|
||||
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
|
||||
)
|
||||
|
||||
|
||||
class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints

    Every test runs as a freshly created superuser against a single
    document uploaded from TEST_SMALL_DOCUMENT_PATH.
    """

    def setUp(self):
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes so that no text decoding or newline
        # translation is applied to the uploaded file.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        """
        POST to the document submit endpoint and expect a 202 response and
        the known text in the first page's content.
        """
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        # NOTE(review): reading the content right after the 202 presumably
        # relies on the processing running synchronously under the test
        # settings - confirm.
        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_submit_document_version(self):
        """
        POST to the document version submit endpoint and expect a 202
        response and the known text in the first page's content.
        """
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_get_document_version_page_content(self):
        """
        GET the page content endpoint for the first page of the latest
        version and expect the known text in the JSON 'content' field.
        """
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)

        self.assertIn(
            'Mayan EDMS Documentation', json.loads(response.content)['content']
        )
|
||||
41
mayan/apps/document_parsing/tests/test_events.py
Normal file
41
mayan/apps/document_parsing/tests/test_events.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from actstream.models import Action
|
||||
|
||||
from documents.tests.test_models import GenericDocumentTestCase
|
||||
|
||||
from ..events import (
|
||||
event_ocr_document_version_submit, event_ocr_document_version_finish
|
||||
)
|
||||
|
||||
|
||||
class OCREventsTestCase(GenericDocumentTestCase):
    """
    Check the actions recorded after a document is submitted for OCR.
    """

    def test_document_version_submit_event(self):
        """
        The first recorded action targets the latest document version and
        carries the submit event name as its verb.
        """
        # Drop any actions recorded before the submission so that only the
        # ones produced by this test are inspected.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        action = Action.objects.first()

        self.assertEqual(action.target, self.document.latest_version)
        self.assertEqual(action.verb, event_ocr_document_version_submit.name)

    def test_document_version_finish_event(self):
        """
        The last recorded action targets the latest document version and
        carries the finish event name as its verb.
        """
        # Drop any actions recorded before the submission so that only the
        # ones produced by this test are inspected.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        action = Action.objects.last()

        self.assertEqual(action.target, self.document.latest_version)
        self.assertEqual(action.verb, event_ocr_document_version_finish.name)
|
||||
77
mayan/apps/document_parsing/tests/test_models.py
Normal file
77
mayan/apps/document_parsing/tests/test_models.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.settings import setting_language_choices
|
||||
from documents.tests import (
|
||||
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
|
||||
class DocumentOCRTestCase(BaseTestCase):
    """
    Check the page content of a document uploaded from
    TEST_SMALL_DOCUMENT_PATH.
    """

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes so that no text decoding or newline
        # translation is applied to the uploaded file.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """
        The first page's content contains the known text of the fixture.
        """
        content = self.document.pages.first().ocr_content.content
        self.assertIn('Mayan EDMS Documentation', content)
|
||||
|
||||
|
||||
class GermanOCRSupportTestCase(BaseTestCase):
    """
    Check the page content of a document uploaded from
    TEST_DEU_DOCUMENT_PATH with the German language code.
    """

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get the corresponding language code for German from the default
        # language choices list. Each entry is indexed as (code, label, ...);
        # only the first match is needed, so stop at it instead of building
        # a list of every match.
        language_code = next(
            language[0] for language in setting_language_choices.value
            if language[1] == 'German'
        )

        self.assertEqual('deu', language_code)

        # Read the fixture as raw bytes so that no text decoding or newline
        # translation is applied to the uploaded file.
        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """
        The first page's content contains two known German phrases.
        """
        content = self.document.pages.first().ocr_content.content

        self.assertIn('Repository für elektronische Dokumente.', content)
        self.assertIn('Es bietet einen', content)
|
||||
83
mayan/apps/document_parsing/tests/test_parsers.py
Normal file
83
mayan/apps/document_parsing/tests/test_parsers.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
from django.test import override_settings
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import (
|
||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
from ..classes import TextExtractor
|
||||
from ..parsers import PDFMinerParser, PopplerParser
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """
    Run each parser directly against the latest version of a document
    uploaded from TEST_DOCUMENT_PATH, with OCR_AUTO_OCR overridden to False.
    """

    def setUp(self):
        super(ParserTestCase, self).setUp()
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes so that no text decoding or newline
        # translation is applied to the uploaded file.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        """
        After PDFMinerParser processes the latest version, the first page's
        content contains the known text.
        """
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        """
        After PopplerParser processes the latest version, the first page's
        content contains the known text.
        """
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """
    Run TextExtractor against the latest version of a document uploaded
    from TEST_HYBRID_DOCUMENT_PATH, with OCR_AUTO_OCR overridden to False.
    """

    def setUp(self):
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes so that no text decoding or newline
        # translation is applied to the uploaded file.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        """
        After extraction the first and last pages of the latest version
        hold their respective expected text.
        """
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        pages = self.document.latest_version.pages

        self.assertEqual(
            pages.first().ocr_content.content, 'Sample text',
        )

        self.assertEqual(
            pages.last().ocr_content.content, 'Sample text in image form',
        )
|
||||
61
mayan/apps/document_parsing/tests/test_views.py
Normal file
61
mayan/apps/document_parsing/tests/test_views.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.test_views import GenericDocumentViewTestCase
|
||||
|
||||
from ..permissions import permission_ocr_content_view
|
||||
from ..utils import get_document_ocr_content
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
    """
    Exercise the document content and content download views, with and
    without the content view permission granted.
    """

    # The file descriptor leak check is switched off for this case: PyOCR
    # leaks descriptors in get_available_languages and image_to_string, and
    # the check stays off until that is fixed upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(OCRViewsTestCase, self).setUp()
        self.login_user()

    def _document_content_view(self):
        """Request the content view of the test document."""
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_content', args=view_arguments)

    def _document_ocr_download_view(self):
        """Request the content download view of the test document."""
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_ocr_download', args=view_arguments)

    def test_document_content_view_no_permissions(self):
        result = self._document_content_view()

        self.assertEqual(result.status_code, 403)

    def test_document_content_view_with_permission(self):
        self.grant_permission(permission=permission_ocr_content_view)

        result = self._document_content_view()

        self.assertContains(
            result, 'Mayan EDMS Documentation', status_code=200
        )

    def test_document_ocr_download_view_no_permission(self):
        result = self._document_ocr_download_view()

        self.assertEqual(result.status_code, 403)

    def test_document_download_view_with_permission(self):
        self.expected_content_type = 'application/octet-stream; charset=utf-8'

        self.grant_permission(permission=permission_ocr_content_view)
        result = self._document_ocr_download_view()

        self.assertEqual(result.status_code, 200)

        # The download is expected to be the concatenation of every chunk
        # yielded for the document by get_document_ocr_content.
        expected_content = ''.join(
            get_document_ocr_content(document=self.document)
        )
        self.assert_download_response(result, content=expected_content)
|
||||
Reference in New Issue
Block a user