Initial commit of the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-23 02:23:14 -04:00
parent 317d07a355
commit e9591c92f9
25 changed files with 1350 additions and 0 deletions

View File

@@ -0,0 +1,88 @@
from __future__ import unicode_literals
import json
from django.contrib.auth import get_user_model
from django.urls import reverse
from rest_framework import status
from documents.models import DocumentType
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
from rest_api.tests import BaseAPITestCase
from user_management.tests import (
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
)
class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints
    """

    def setUp(self):
        """Create and log in a superuser, then upload the test document."""
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes: text mode can alter or fail to
        # decode document content that is not plain text.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        """Remove the document type created in setUp."""
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        """POSTing to the document submit view returns HTTP 202."""
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_submit_document_version(self):
        """POSTing to the document version submit view returns HTTP 202."""
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_get_document_version_page_content(self):
        """GETting the page content view returns the page's OCR text."""
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertIn(
            'Mayan EDMS Documentation',
            json.loads(response.content)['content']
        )

View File

@@ -0,0 +1,41 @@
from __future__ import unicode_literals
from actstream.models import Action
from documents.tests.test_models import GenericDocumentTestCase
from ..events import (
event_ocr_document_version_submit, event_ocr_document_version_finish
)
class OCREventsTestCase(GenericDocumentTestCase):
    """Check the actions recorded when a document is submitted for OCR."""

    def test_document_version_submit_event(self):
        """The first recorded action is the submit event."""
        # Start from an empty action log so only actions recorded after
        # this point are inspected.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.first().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.first().verb,
            event_ocr_document_version_submit.name
        )

    def test_document_version_finish_event(self):
        """The last recorded action is the finish event."""
        # Start from an empty action log so only actions recorded after
        # this point are inspected.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.last().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.last().verb,
            event_ocr_document_version_finish.name
        )

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.settings import setting_language_choices
from documents.tests import (
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
)
class DocumentOCRTestCase(BaseTestCase):
    """Check the OCR content stored for the small test document."""

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is
    # fixed upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        """Create a document type and upload the small test document."""
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes: text mode can alter or fail to
        # decode document content that is not plain text.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        """Remove the document and the document type created in setUp."""
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """The first page's OCR content contains the expected title."""
        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)
class GermanOCRSupportTestCase(BaseTestCase):
    """Check the OCR content of a document uploaded with German language."""

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is
    # fixed upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        """Upload the German test document using the German language code."""
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get corresponding language code for German from the default
        # language choices list. Each choice is indexed as (code, name).
        language_code = next(
            language[0] for language in setting_language_choices.value
            if language[1] == 'German'
        )

        self.assertEqual('deu', language_code)

        # Read the fixture as raw bytes: text mode can alter or fail to
        # decode document content that is not plain text.
        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        """Remove the document type created in setUp."""
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """The first page's OCR content contains the German phrases."""
        content = self.document.pages.first().ocr_content.content

        self.assertIn('Repository für elektronische Dokumente.', content)
        self.assertIn('Es bietet einen', content)

View File

@@ -0,0 +1,83 @@
from __future__ import unicode_literals
from django.core.files.base import File
from django.test import override_settings
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.tests import (
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
)
from ..classes import TextExtractor
from ..parsers import PDFMinerParser, PopplerParser
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """Run each parser directly, with OCR_AUTO_OCR overridden to False."""

    def setUp(self):
        """Create a document type and upload the test document."""
        super(ParserTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes: text mode can alter or fail to
        # decode document content that is not plain text.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        """Remove the document type created in setUp."""
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        """PDFMinerParser stores the expected text for the first page."""
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        """PopplerParser stores the expected text for the first page."""
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """Run TextExtractor directly, with OCR_AUTO_OCR overridden to False."""

    def setUp(self):
        """Upload the fixture located at TEST_HYBRID_DOCUMENT_PATH."""
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes: text mode can alter or fail to
        # decode document content that is not plain text.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        """Remove the document type created in setUp."""
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        """The first and last pages get their respective expected text."""
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )

View File

@@ -0,0 +1,61 @@
from __future__ import unicode_literals
from django.test import override_settings
from documents.tests.test_views import GenericDocumentViewTestCase
from ..permissions import permission_ocr_content_view
from ..utils import get_document_ocr_content
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
    """Exercise the OCR content and OCR download views."""

    # The descriptor leak test stays off for this case: PyOCR leaks file
    # descriptors in get_available_languages and image_to_string, pending
    # an upstream fix.
    _skip_file_descriptor_test = True

    def setUp(self):
        """Run the parent setup and log the test user in."""
        super(OCRViewsTestCase, self).setUp()
        self.login_user()

    def _document_content_view(self):
        """Request the OCR content view of the test document."""
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_content', args=view_arguments)

    def _document_ocr_download_view(self):
        """Request the OCR download view of the test document."""
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_ocr_download', args=view_arguments)

    def test_document_content_view_no_permissions(self):
        """Without the permission the content view answers 403."""
        result = self._document_content_view()

        self.assertEqual(result.status_code, 403)

    def test_document_content_view_with_permission(self):
        """With the permission the content view shows the OCR text."""
        self.grant_permission(permission=permission_ocr_content_view)

        result = self._document_content_view()

        self.assertContains(
            result, 'Mayan EDMS Documentation', status_code=200
        )

    def test_document_ocr_download_view_no_permission(self):
        """Without the permission the download view answers 403."""
        result = self._document_ocr_download_view()

        self.assertEqual(result.status_code, 403)

    def test_document_download_view_with_permission(self):
        """With the permission the download carries the joined OCR text."""
        self.expected_content_type = 'application/octet-stream; charset=utf-8'
        self.grant_permission(permission=permission_ocr_content_view)

        result = self._document_ocr_download_view()

        self.assertEqual(result.status_code, 200)

        expected_text = ''.join(
            get_document_ocr_content(document=self.document)
        )
        self.assert_download_response(result, content=expected_text)