Finish the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-24 03:54:07 -04:00
parent e9591c92f9
commit a7eaf6b368
25 changed files with 423 additions and 639 deletions

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
from django.contrib import admin
from .models import (
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
DocumentPageContent, DocumentVersionParseError
)
@@ -12,12 +12,7 @@ class DocumentPageContentAdmin(admin.ModelAdmin):
list_display = ('document_page',)
@admin.register(DocumentTypeSettings)
class DocumentTypeSettingsAdmin(admin.ModelAdmin):
list_display = ('document_type', 'auto_ocr')
@admin.register(DocumentVersionOCRError)
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
@admin.register(DocumentVersionParseError)
class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
list_display = ('document_version', 'datetime_submitted')
readonly_fields = ('document_version', 'datetime_submitted', 'result')

View File

@@ -1,75 +1,19 @@
from __future__ import absolute_import, unicode_literals
from rest_framework import generics, status
from rest_framework import generics
from rest_framework.response import Response
from documents.models import Document, DocumentPage, DocumentVersion
from documents.models import DocumentPage
from rest_api.permissions import MayanPermission
from .models import DocumentPageContent
from .permissions import permission_ocr_content_view, permission_ocr_document
from .permissions import permission_content_view
from .serializers import DocumentPageContentSerializer
class APIDocumentOCRView(generics.GenericAPIView):
mayan_object_permissions = {
'POST': (permission_ocr_document,)
}
permission_classes = (MayanPermission,)
queryset = Document.objects.all()
def get_serializer_class(self):
return None
def post(self, request, *args, **kwargs):
"""
Submit a document for OCR.
---
omit_serializer: true
parameters:
- name: pk
paramType: path
type: number
responseMessages:
- code: 202
message: Accepted
"""
self.get_object().submit_for_ocr()
return Response(status=status.HTTP_202_ACCEPTED)
class APIDocumentVersionOCRView(generics.GenericAPIView):
mayan_object_permissions = {
'POST': (permission_ocr_document,)
}
permission_classes = (MayanPermission,)
queryset = DocumentVersion.objects.all()
def get_serializer_class(self):
return None
def post(self, request, *args, **kwargs):
"""
Submit a document version for OCR.
---
omit_serializer: true
parameters:
- name: pk
paramType: path
type: number
responseMessages:
- code: 202
message: Accepted
"""
self.get_object().submit_for_ocr()
return Response(status=status.HTTP_202_ACCEPTED)
class APIDocumentPageContentView(generics.RetrieveAPIView):
"""
Returns the OCR content of the selected document page.
Returns the content of the selected document page.
---
GET:
parameters:
@@ -79,7 +23,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
"""
mayan_object_permissions = {
'GET': (permission_ocr_content_view,),
'GET': (permission_content_view,),
}
permission_classes = (MayanPermission,)
serializer_class = DocumentPageContentSerializer
@@ -89,9 +33,9 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
instance = self.get_object()
try:
ocr_content = instance.ocr_content
content = instance.content
except DocumentPageContent.DoesNotExist:
ocr_content = DocumentPageContent.objects.none()
content = DocumentPageContent.objects.none()
serializer = self.get_serializer(ocr_content)
serializer = self.get_serializer(content)
return Response(serializer.data)

View File

@@ -1,11 +1,12 @@
from __future__ import unicode_literals
from datetime import timedelta
import logging
from kombu import Exchange, Queue
from django.apps import apps
from django.db.models.signals import post_save
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
from acls import ModelPermission
@@ -21,16 +22,38 @@ from mayan.celery import app
from navigation import SourceColumn
from rest_api.classes import APIEndPoint
from .events import event_parsing_document_version_submit
from .handlers import handler_parse_document_version
from .links import (
link_document_content, link_entry_list, link_document_content_errors_list,
link_document_content_download
link_document_content, link_document_content_download,
link_document_parsing_errors_list, link_document_submit_multiple,
link_document_submit, link_document_type_submit, link_error_list
)
from .permissions import permission_content_view
logger = logging.getLogger(__name__)
def document_parsing_submit(self):
    """
    Submit the latest version of this document for parsing.

    Meant to be attached to the Document model as ``submit_for_parsing``.
    Returns nothing; a document without a latest version is ignored.
    """
    version = self.latest_version

    # Don't error out if document has no version
    if not version:
        return

    version.submit_for_parsing()
def document_version_parsing_submit(self):
    """
    Commit the "submitted for parsing" event for this document version and
    schedule the background parsing task for it.

    Meant to be attached to the DocumentVersion model as
    ``submit_for_parsing``.
    """
    # The task is imported inside the function rather than at module level.
    from .tasks import task_parse_document_version

    event_parsing_document_version_submit.commit(
        action_object=self.document, target=self
    )

    # The task is not executed right away: its ETA is the current time plus
    # the number of seconds held by settings_db_sync_task_delay.
    scheduled_for = now() + timedelta(
        seconds=settings_db_sync_task_delay.value
    )
    task_parse_document_version.apply_async(
        eta=scheduled_for, kwargs={'document_version_pk': self.pk},
    )
class DocumentParsingApp(MayanAppConfig):
has_tests = True
name = 'document_parsing'
@@ -45,16 +68,17 @@ class DocumentParsingApp(MayanAppConfig):
app_label='documents', model_name='Document'
)
DocumentType = apps.get_model(
app_label='documents', model_name='DocumentType'
)
DocumentVersion = apps.get_model(
app_label='documents', model_name='DocumentVersion'
)
DocumentVersionParseError = self.get_model('DocumentVersionParseError')
Document.add_to_class('submit_for_parsing', document_parsing_submit)
DocumentVersion.add_to_class(
'submit_for_parsing', document_version_parsing_submit
)
ModelPermission.register(
model=Document, permissions=(permission_content_view,)
)
@@ -72,6 +96,18 @@ class DocumentParsingApp(MayanAppConfig):
attribute='result'
)
app.conf.CELERY_QUEUES.append(
Queue('parsing', Exchange('parsing'), routing_key='parsing'),
)
app.conf.CELERY_ROUTES.update(
{
'document_parsing.tasks.task_parse_document_version': {
'queue': 'parsing'
},
}
)
document_search.add_model_field(
field='versions__pages__content__content', label=_('Content')
)
@@ -89,32 +125,20 @@ class DocumentParsingApp(MayanAppConfig):
menu_object.bind_links(
links=(link_document_submit,), sources=(Document,)
)
menu_object.bind_links(
links=(link_document_type_ocr_settings,), sources=(DocumentType,)
)
menu_secondary.bind_links(
links=(
link_document_content, link_document_ocr_erros_list,
link_document_ocr_download
link_document_content, link_document_parsing_errors_list,
link_document_content_download
),
sources=(
'document_parsing:document_content',
'document_parsing:document_ocr_error_list',
'document_parsing:document_ocr_download',
)
)
menu_secondary.bind_links(
links=(link_entry_list,),
sources=(
'document_parsing:entry_list',
'document_parsing:entry_delete_multiple',
'document_parsing:entry_re_queue_multiple',
DocumentVersionParseError
'document_parsing:document_content_download',
'document_parsing:document_parsing_error_list',
)
)
menu_tools.bind_links(
links=(
link_entry_list
link_document_type_submit, link_error_list,
)
)

View File

@@ -0,0 +1,14 @@
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from events.classes import Event
# Event committed when a document version is submitted for parsing.
event_parsing_document_version_submit = Event(
name='parsing_document_version_submit',
label=_('Document version submitted for parsing')
)
# Event committed after a document version has been parsed without errors.
event_parsing_document_version_finish = Event(
name='parsing_document_version_finish',
label=_('Document version parsing finished')
)

View File

@@ -1,13 +1,6 @@
from __future__ import unicode_literals
class OCRError(Exception):
"""
Raised by the OCR backend
"""
pass
class ParserError(Exception):
"""
Base exception for file parsers

View File

@@ -6,10 +6,12 @@ from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe
from django.utils.translation import ugettext_lazy as _, ugettext
from acls.models import AccessControlList
from common.widgets import TextAreaDiv
from documents.models import DocumentType
from .models import DocumentPageContent, DocumentPageOCRContent
from .models import DocumentPageContent
from .permissions import permission_parse_document
class DocumentContentForm(forms.Form):
@@ -29,7 +31,7 @@ class DocumentContentForm(forms.Form):
for page in document_pages:
try:
page_content = page.ocr_content.content
page_content = page.content.content
except DocumentPageContent.DoesNotExist:
pass
else:
@@ -55,50 +57,16 @@ class DocumentContentForm(forms.Form):
)
class DocumentOCRContentForm(forms.Form):
"""
Form that concatenates all of a document pages' text content into a
single textarea widget
"""
def __init__(self, *args, **kwargs):
self.document = kwargs.pop('instance', None)
super(DocumentContentForm, self).__init__(*args, **kwargs)
content = []
self.fields['contents'].initial = ''
try:
document_pages = self.document.pages.all()
except AttributeError:
document_pages = []
for page in document_pages:
try:
page_content = page.ocr_content.content
except DocumentPageOCRContent.DoesNotExist:
pass
else:
content.append(conditional_escape(force_text(page_content)))
content.append(
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
ugettext(
'Page %(page_number)d'
) % {'page_number': page.page_number}
)
)
self.fields['contents'].initial = mark_safe(''.join(content))
contents = forms.CharField(
label=_('Contents'),
widget=TextAreaDiv(
attrs={
'class': 'text_area_div full-height',
'data-height-difference': 360
}
)
)
class DocumentTypeSelectForm(forms.Form):
document_type = forms.ModelChoiceField(
queryset=DocumentType.objects.all(), label=('Document type')
queryset=DocumentType.objects.none(), label=('Document type')
)
def __init__(self, *args, **kwargs):
user = kwargs.pop('user')
super(DocumentTypeSelectForm, self).__init__(*args, **kwargs)
queryset = AccessControlList.objects.filter_by_access(
permission=permission_parse_document,
queryset=DocumentType.objects.all(), user=user,
)
self.fields['document_type'].queryset = queryset

View File

@@ -2,14 +2,8 @@ from __future__ import unicode_literals
import logging
from django.apps import apps
from .settings import setting_auto_ocr
from .parsers import Parser
logger = logging.getLogger(__name__)
def handler_parse_document_version(sender, instance, **kwargs):
if kwargs['created']:
Parser.parse_document_version(document_version=instance)
instance.submit_for_parsing()

View File

@@ -4,24 +4,36 @@ from django.utils.translation import ugettext_lazy as _
from navigation import Link
from .permissions import permission_content_view
from .permissions import permission_content_view, permission_parse_document
link_document_content = Link(
args='resolved_object.id', icon='fa fa-font',
permissions=(permission_content_view,), text=_('Content'),
view='document_parsing:document_content',
)
link_entry_list = Link(
icon='fa fa-file-text-o', permissions=(permission_ocr_document,),
text=_('Parsing errors'), view='document_parsing:entry_list'
)
link_document_content_errors_list = Link(
link_document_parsing_errors_list = Link(
args='resolved_object.id', icon='fa fa-file-text-o',
permissions=(permission_ocr_content_view,), text=_('Parsing errors'),
view='document_parsing:document_page_parsing_error_list'
permissions=(permission_content_view,), text=_('Parsing errors'),
view='document_parsing:document_parsing_error_list'
)
link_document_content_download = Link(
args='resolved_object.id', icon='fa fa-file-text-o',
permissions=(permission_ocr_content_view,), text=_('Download content'),
permissions=(permission_content_view,), text=_('Download content'),
view='document_parsing:document_content_download'
)
# Link to the multiple document submit view. No permissions are declared on
# the link itself.
# NOTE(review): presumably access is enforced by the target view - confirm.
link_document_submit_multiple = Link(
text=_('Submit for parsing'),
view='document_parsing:document_submit_multiple'
)
# Link to submit a single document for parsing; the id of the resolved object
# is passed as the view argument and the parse permission is required.
link_document_submit = Link(
args='resolved_object.id', permissions=(permission_parse_document,),
text=_('Submit for parsing'), view='document_parsing:document_submit'
)
# Link to the view that submits documents for parsing per document type. No
# permissions are declared on the link itself.
link_document_type_submit = Link(
icon='fa fa-crosshairs', text=_('Parse documents per type'),
view='document_parsing:document_type_submit'
)
# Link to the list of all parsing errors; requires the content view
# permission.
link_error_list = Link(
icon='fa fa-file-text-o', permissions=(permission_content_view,),
text=_('Parsing errors'), view='document_parsing:error_list'
)

View File

@@ -1,14 +1,50 @@
from __future__ import unicode_literals
from datetime import timedelta
import logging
import sys
import traceback
from django.apps import apps
from django.conf import settings
from django.db import models
from django.utils.timezone import now
from .events import event_parsing_document_version_finish
from .parsers import Parser
logger = logging.getLogger(__name__)
class DocumentPageContentManager(models.Manager):
pass
def process_document_version(self, document_version):
    """
    Run the parser over a document version and record the outcome.

    Exceptions raised by the parser are logged and stored as a parsing
    error entry of the document version instead of being re-raised. With
    settings.DEBUG enabled the entry holds the exception type, message and
    traceback; otherwise it holds only the exception itself.

    When parsing succeeds, the existing parsing error entries of the
    document version are deleted and the parsing finished event is
    committed.
    """
    logger.info(
        'Starting parsing for document version: %s', document_version
    )
    logger.debug('document version: %d', document_version.pk)

    try:
        Parser.parse_document_version(document_version=document_version)
    except Exception as exception:
        logger.exception(
            'Parsing error for document version: %d; %s',
            document_version.pk, exception,
        )

        if not settings.DEBUG:
            document_version.parsing_errors.create(result=exception)
        else:
            # The full traceback is only stored while debugging
            exc_type, exc_value, exc_traceback = sys.exc_info()
            report = ['%s: %s' % (exc_type.__name__, exc_value)]
            report.extend(traceback.format_tb(exc_traceback))
            document_version.parsing_errors.create(
                result='\n'.join(report)
            )
    else:
        logger.info(
            'Parsing complete for document version: %s', document_version
        )
        # A successful run clears the errors recorded by earlier attempts
        document_version.parsing_errors.all().delete()

        event_parsing_document_version_finish.commit(
            action_object=document_version.document,
            target=document_version
        )

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-08-23 18:55
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """
    Initial migration of the document parsing app.

    Creates the DocumentPageContent model (one-to-one with a document page)
    and the DocumentVersionParseError model (errors recorded per document
    version).
    """

    initial = True

    dependencies = [
        ('documents', '0041_auto_20170823_1855'),
    ]

    operations = [
        migrations.CreateModel(
            name='DocumentPageContent',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('content', models.TextField(blank=True, verbose_name='Content')),
                ('document_page', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='content', to='documents.DocumentPage', verbose_name='Document page')),
            ],
            options={
                'verbose_name': 'Document page content',
                'verbose_name_plural': 'Document pages contents',
            },
        ),
        migrations.CreateModel(
            name='DocumentVersionParseError',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('datetime_submitted', models.DateTimeField(auto_now_add=True, db_index=True, verbose_name='Date time submitted')),
                ('result', models.TextField(blank=True, null=True, verbose_name='Result')),
                # related_name matches the model definition and the manager
                # code, which access the errors as 'parsing_errors'. Keeping
                # the old 'parse_errors' value here would leave the migration
                # state out of sync with the model.
                ('document_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='parsing_errors', to='documents.DocumentVersion', verbose_name='Document version')),
            ],
            options={
                'ordering': ('datetime_submitted',),
                'verbose_name': 'Document version parse error',
                'verbose_name_plural': 'Document version parse errors',
            },
        ),
    ]

View File

@@ -4,7 +4,7 @@ from django.db import models
from django.utils.encoding import force_text, python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _
from documents.models import DocumentPage, DocumentType, DocumentVersion
from documents.models import DocumentPage, DocumentVersion
from .managers import DocumentPageContentManager
@@ -30,11 +30,11 @@ class DocumentPageContent(models.Model):
@python_2_unicode_compatible
class DocumentVersionParseError(models.Model):
document_version = models.ForeignKey(
DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
verbose_name=_('Document version')
DocumentVersion, on_delete=models.CASCADE,
related_name='parsing_errors', verbose_name=_('Document version')
)
datetime_submitted = models.DateTimeField(
auto_add_now=True, db_index=True, verbose_name=_('Date time submitted')
auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
)
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))

View File

@@ -1,20 +1,15 @@
from __future__ import unicode_literals
from io import BytesIO
import logging
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import subprocess
from django.apps import apps
from django.utils.translation import ugettext_lazy as _
from common.utils import copyfile, fs_cleanup, mkstemp
from .exceptions import ParserError, NoMIMETypeMatch
from .models import DocumentPageContent
from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__)
@@ -82,6 +77,10 @@ class Parser(object):
self.process_document_page(document_page=document_page)
def process_document_page(self, document_page):
DocumentPageContent = apps.get_model(
app_label='document_parsing', model_name='DocumentPageContent'
)
logger.info(
'Processing page: %d of document version: %s',
document_page.page_number, document_page.document_version
@@ -171,32 +170,7 @@ class PopplerParser(Parser):
return output
class PDFMinerParser(Parser):
"""
Parser for PDF files using the PDFMiner library for Python
"""
def execute(self, file_object, page_number):
logger.debug('Parsing PDF page: %d', page_number)
with BytesIO() as string_buffer:
rsrcmgr = PDFResourceManager()
device = TextConverter(
rsrcmgr, outfp=string_buffer, laparams=LAParams()
)
interpreter = PDFPageInterpreter(rsrcmgr, device)
page = PDFPage.get_pages(
file_object, maxpages=1, pagenos=(page_number - 1,)
)
interpreter.process_page(page.next())
device.close()
logger.debug('Finished parsing PDF: %d', page_number)
return string_buffer.getvalue()
Parser.register(
mimetypes=('application/pdf',),
parser_classes=(PopplerParser, PDFMinerParser)
parser_classes=(PopplerParser,)
)

View File

@@ -9,3 +9,7 @@ namespace = PermissionNamespace('document_parsing', _('Document parsing'))
permission_content_view = namespace.add_permission(
name='content_view', label=_('View the content of a document')
)
# Permission required by the single document "Submit for parsing" link and
# used to filter the document types offered by DocumentTypeSelectForm.
permission_parse_document = namespace.add_permission(
name='parse_document', label=_('Parse the content of a document')
)

View File

@@ -4,7 +4,8 @@ from django.utils.translation import ugettext_lazy as _
from task_manager.classes import CeleryQueue
queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
queue_ocr = CeleryQueue(name='parsing', label=_('Parsing'))
queue_ocr.add_task_type(
name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
name='document_parsing.tasks.task_parse_document_version',
label=_('Document version parsing')
)

View File

@@ -0,0 +1,29 @@
from __future__ import unicode_literals
import logging
from django.apps import apps
from mayan.celery import app
logger = logging.getLogger(__name__)
@app.task(ignore_result=True)
def task_parse_document_version(document_version_pk):
    """
    Celery task that parses the content of a single document version.

    document_version_pk is the primary key used to fetch the
    DocumentVersion instance that is handed to the DocumentPageContent
    manager for processing.
    """
    # Models are resolved through the app registry instead of being imported
    version_model = apps.get_model(
        app_label='documents', model_name='DocumentVersion'
    )
    page_content_model = apps.get_model(
        app_label='document_parsing', model_name='DocumentPageContent'
    )

    version = version_model.objects.get(pk=document_version_pk)

    logger.info('Starting parsing for document version: %s', version)

    page_content_model.objects.process_document_version(
        document_version=version
    )

View File

@@ -1,88 +0,0 @@
from __future__ import unicode_literals
import json
from django.contrib.auth import get_user_model
from django.urls import reverse
from rest_framework import status
from documents.models import DocumentType
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
from rest_api.tests import BaseAPITestCase
from user_management.tests import (
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
)
class OCRAPITestCase(BaseAPITestCase):
"""
Test the OCR app API endpoints
"""
def setUp(self):
super(OCRAPITestCase, self).setUp()
self.admin_user = get_user_model().objects.create_superuser(
username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
password=TEST_ADMIN_PASSWORD
)
self.client.login(
username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
)
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object,
)
def tearDown(self):
self.document_type.delete()
super(OCRAPITestCase, self).tearDown()
def test_submit_document(self):
response = self.client.post(
reverse(
'rest_api:document-ocr-submit-view',
args=(self.document.pk,)
)
)
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
content = self.document.pages.first().ocr_content.content
self.assertTrue('Mayan EDMS Documentation' in content)
def test_submit_document_version(self):
response = self.client.post(
reverse(
'rest_api:document-version-ocr-submit-view',
args=(self.document.latest_version.pk,)
)
)
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
content = self.document.pages.first().ocr_content.content
self.assertTrue('Mayan EDMS Documentation' in content)
def test_get_document_version_page_content(self):
response = self.client.get(
reverse(
'rest_api:document-page-content-view',
args=(self.document.latest_version.pages.first().pk,)
),
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertTrue(
'Mayan EDMS Documentation' in json.loads(response.content)['content']
)

View File

@@ -2,40 +2,38 @@ from __future__ import unicode_literals
from actstream.models import Action
from documents.tests.literals import TEST_DOCUMENT_FILENAME
from documents.tests.test_models import GenericDocumentTestCase
from ..events import (
event_ocr_document_version_submit, event_ocr_document_version_finish
event_parsing_document_version_submit,
event_parsing_document_version_finish
)
class OCREventsTestCase(GenericDocumentTestCase):
class DocumentParsingEventsTestCase(GenericDocumentTestCase):
# Ensure we use a PDF file
test_document_filename = TEST_DOCUMENT_FILENAME
def test_document_version_submit_event(self):
Action.objects.all().delete()
self.document.submit_for_ocr()
self.assertEqual(
Action.objects.first().target, self.document.latest_version
)
self.assertEqual(
Action.objects.first().verb,
event_ocr_document_version_submit.name
)
def test_document_version_finish_event(self):
Action.objects.all().delete()
self.document.submit_for_ocr()
from ..models import DocumentVersionOCRError, DocumentPageContent
#print DocumentVersionOCRError.objects.all()
print DocumentPageContent.objects.all()
for a in Action.objects.all():
print a
self.document.submit_for_parsing()
self.assertEqual(
Action.objects.last().target, self.document.latest_version
)
self.assertEqual(
Action.objects.last().verb,
event_ocr_document_version_finish.name
event_parsing_document_version_submit.name
)
def test_document_version_finish_event(self):
Action.objects.all().delete()
self.document.submit_for_parsing()
self.assertEqual(
Action.objects.first().target, self.document.latest_version
)
self.assertEqual(
Action.objects.first().verb,
event_parsing_document_version_finish.name
)

View File

@@ -1,77 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.settings import setting_language_choices
from documents.tests import (
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
)
class DocumentOCRTestCase(BaseTestCase):
# PyOCR's leak descriptor in get_available_languages and image_to_string
# Disable descriptor leak test until fixed in upstream
_skip_file_descriptor_test = True
def setUp(self):
super(DocumentOCRTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object,
)
def tearDown(self):
self.document.delete()
self.document_type.delete()
super(DocumentOCRTestCase, self).tearDown()
def test_ocr_language_backends_end(self):
content = self.document.pages.first().ocr_content.content
self.assertTrue('Mayan EDMS Documentation' in content)
class GermanOCRSupportTestCase(BaseTestCase):
# PyOCR's leak descriptor in get_available_languages and image_to_string
# Disable descriptor leak test until fixed in upstream
_skip_file_descriptor_test = True
def setUp(self):
super(GermanOCRSupportTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
# Get corresponding language code for German from the default language
# choices list
language_code = [
language for language in setting_language_choices.value if language[1] == 'German'
][0][0]
self.assertEqual('deu', language_code)
with open(TEST_DEU_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object, language=language_code
)
def tearDown(self):
self.document_type.delete()
super(GermanOCRSupportTestCase, self).tearDown()
def test_ocr_language_backends_end(self):
content = self.document.pages.first().ocr_content.content
self.assertTrue(
'Repository für elektronische Dokumente.' in content
)
self.assertTrue(
'Es bietet einen' in content
)

View File

@@ -5,12 +5,9 @@ from django.test import override_settings
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.tests import (
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
)
from documents.tests import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL
from ..classes import TextExtractor
from ..parsers import PDFMinerParser, PopplerParser
from ..parsers import PopplerParser
@override_settings(OCR_AUTO_OCR=False)
@@ -30,54 +27,11 @@ class ParserTestCase(BaseTestCase):
self.document_type.delete()
super(ParserTestCase, self).tearDown()
def test_pdfminer_parser(self):
parser = PDFMinerParser()
parser.process_document_version(self.document.latest_version)
self.assertTrue(
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
)
def test_poppler_parser(self):
parser = PopplerParser()
parser.process_document_version(self.document.latest_version)
self.assertTrue(
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
)
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
def setUp(self):
super(TextExtractorTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=File(file_object)
)
def tearDown(self):
self.document_type.delete()
super(TextExtractorTestCase, self).tearDown()
def test_text_extractor(self):
TextExtractor.process_document_version(
document_version=self.document.latest_version
)
self.assertEqual(
self.document.latest_version.pages.first().ocr_content.content,
'Sample text',
)
self.assertEqual(
self.document.latest_version.pages.last().ocr_content.content,
'Sample text in image form',
'Mayan EDMS Documentation' in self.document.pages.first().content.content
)

View File

@@ -1,26 +1,25 @@
from __future__ import unicode_literals
from django.test import override_settings
from documents.tests.literals import TEST_DOCUMENT_FILENAME
from documents.tests.test_views import GenericDocumentViewTestCase
from ..permissions import permission_ocr_content_view
from ..utils import get_document_ocr_content
from ..permissions import permission_content_view
from ..utils import get_document_content
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
# PyOCR's leak descriptor in get_available_languages and image_to_string
# Disable descriptor leak test until fixed in upstream
class DocumentContentViewsTestCase(GenericDocumentViewTestCase):
_skip_file_descriptor_test = True
# Ensure we use a PDF file
test_document_filename = TEST_DOCUMENT_FILENAME
def setUp(self):
super(OCRViewsTestCase, self).setUp()
super(DocumentContentViewsTestCase, self).setUp()
self.login_user()
def _document_content_view(self):
return self.get(
'ocr:document_content', args=(self.document.pk,)
'document_parsing:document_content', args=(self.document.pk,)
)
def test_document_content_view_no_permissions(self):
@@ -29,7 +28,7 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
self.assertEqual(response.status_code, 403)
def test_document_content_view_with_permission(self):
self.grant_permission(permission=permission_ocr_content_view)
self.grant_permission(permission=permission_content_view)
response = self._document_content_view()
@@ -37,25 +36,25 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
response, 'Mayan EDMS Documentation', status_code=200
)
def test_document_ocr_download_view_no_permission(self):
def test_document_parsing_download_view_no_permission(self):
response = self.get(
'ocr:document_ocr_download', args=(self.document.pk,)
'document_parsing:document_content_download', args=(self.document.pk,)
)
self.assertEqual(response.status_code, 403)
def test_document_download_view_with_permission(self):
def test_download_view_with_permission(self):
self.expected_content_type = 'application/octet-stream; charset=utf-8'
self.grant_permission(permission=permission_ocr_content_view)
self.grant_permission(permission=permission_content_view)
response = self.get(
'ocr:document_ocr_download', args=(self.document.pk,)
'document_parsing:document_content_download', args=(self.document.pk,)
)
self.assertEqual(response.status_code, 200)
self.assert_download_response(
response, content=(
''.join(get_document_ocr_content(document=self.document))
''.join(get_document_content(document=self.document))
),
)

View File

@@ -2,62 +2,43 @@ from __future__ import unicode_literals
from django.conf.urls import url
from .api_views import (
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
)
from .api_views import APIDocumentPageContentView
from .views import (
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
DocumentContentView, DocumentContentDownloadView,
DocumentParsingErrorsListView, DocumentSubmitView, DocumentTypeSubmitView,
ParseErrorListView
)
urlpatterns = [
url(
r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(),
r'^documents/(?P<pk>\d+)/content/$', DocumentContentView.as_view(),
name='document_content'
),
url(
r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
name='document_submit'
r'^documents/(?P<pk>\d+)/content/download/$',
DocumentContentDownloadView.as_view(), name='document_content_download'
),
url(
r'^document/all/submit/$', DocumentAllSubmitView.as_view(),
name='document_submit_all'
),
url(
r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
r'^document_types/submit/$', DocumentTypeSubmitView.as_view(),
name='document_type_submit'
),
url(
r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(),
r'^documents/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
name='document_submit'
),
url(
r'^documents/multiple/submit/$', DocumentSubmitView.as_view(),
name='document_submit_multiple'
),
url(
r'^document_type/(?P<pk>\d+)/ocr/settings/$',
DocumentTypeSettingsEditView.as_view(),
name='document_type_ocr_settings'
r'^documents/(?P<pk>\d+)/errors/$',
DocumentParsingErrorsListView.as_view(),
name='document_parsing_error_list'
),
url(
r'^documents/(?P<pk>\d+)/ocr/errors/$',
DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
),
url(
r'^documents/(?P<pk>\d+)/ocr/download/$',
DocumentOCRDownloadView.as_view(), name='document_ocr_download'
),
url(r'^all/$', EntryListView.as_view(), name='entry_list'),
url(r'^errors/all/$', ParseErrorListView.as_view(), name='error_list'),
]
api_urls = [
url(
r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
name='document-ocr-submit-view'
),
url(
r'^document_version/(?P<pk>\d+)/submit/$',
APIDocumentVersionOCRView.as_view(),
name='document-version-ocr-submit-view'
),
url(
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
name='document-page-content-view'

View File

@@ -6,10 +6,10 @@ from django.utils.html import conditional_escape
from .models import DocumentPageContent
def get_document_ocr_content(document):
def get_document_content(document):
for page in document.pages.all():
try:
page_content = page.ocr_content.content
page_content = page.content.content
except DocumentPageContent.DoesNotExist:
pass
else:

View File

@@ -4,137 +4,27 @@ from django.contrib import messages
from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404
from django.urls import reverse
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext_lazy as _, ungettext
from acls.models import AccessControlList
from common.generics import (
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
SingleObjectEditView, SingleObjectListView
FormView, MultipleObjectConfirmActionView, SingleObjectDetailView,
SingleObjectDownloadView, SingleObjectListView
)
from common.mixins import MultipleInstanceActionMixin
from documents.models import Document, DocumentType
from documents.models import Document
from .forms import DocumentContentForm, DocumentTypeSelectForm
from .models import DocumentVersionOCRError
from .permissions import (
permission_ocr_content_view, permission_ocr_document,
permission_document_type_ocr_setup
)
from .utils import get_document_ocr_content
from .models import DocumentVersionParseError
from .permissions import permission_content_view, permission_parse_document
from .utils import get_document_content
class DocumentAllSubmitView(ConfirmView):
    """Confirmation view that submits every document for OCR."""

    extra_context = {
        'title': _('Submit all documents for OCR?'),
    }

    def get_post_action_redirect(self):
        """Return the URL of the ``common:tools_list`` view."""
        return reverse('common:tools_list')

    def view_action(self):
        """Submit each document for OCR and flash the number submitted."""
        # ``submitted`` stays at 0 when there are no documents at all.
        submitted = 0
        for submitted, document in enumerate(Document.objects.all(), 1):
            document.submit_for_ocr()

        message = _('%d documents added to the OCR queue.') % submitted
        messages.success(self.request, message)
class DocumentSubmitView(ConfirmView):
    """Confirmation view that submits one document to the OCR queue."""

    def get_extra_context(self):
        """Return the template context: the document and the prompt title."""
        # Resolve the document once and reuse it for both context entries.
        instance = self.get_object()
        return {
            'object': instance,
            'title': _('Submit "%s" to the OCR queue?') % instance
        }

    def get_object(self):
        """Return the document named by the URL ``pk``.

        Raises Http404 when no document has that primary key.
        """
        # get_object_or_404 turns an unknown pk into a 404 response instead
        # of letting Document.DoesNotExist escape as a server error.
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def object_action(self, instance):
        """Check the OCR permission on ``instance``, then submit it."""
        # NOTE(review): check_access presumably raises when the user lacks
        # the permission, which would abort the submission -- confirm.
        AccessControlList.objects.check_access(
            permissions=permission_ocr_document, user=self.request.user,
            obj=instance
        )
        instance.submit_for_ocr()

    def view_action(self):
        """Run the action on the document and flash a success message."""
        instance = self.get_object()

        self.object_action(instance=instance)

        messages.success(
            self.request,
            _('Document: %(document)s was added to the OCR queue.') % {
                'document': instance
            }
        )
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
    """Variant of DocumentSubmitView combined with the multiple instance
    action mixin, for submitting a selection of documents."""

    model = Document
    # Singular and plural feedback templates; both take a ``count`` key.
    success_message = '%(count)d document submitted to the OCR queue.'
    success_message_plural = '%(count)d documents submitted to the OCR queue.'

    def get_extra_context(self):
        """Return only a generic title, replacing the parent's context."""
        # The parent implementation resolves a single object from the URL,
        # so it is overridden rather than extended.
        title = _('Submit the selected documents to the OCR queue?')
        return {'title': title}
class DocumentTypeSubmitView(FormView):
    """Form view that submits all documents of a chosen type for OCR."""

    extra_context = {
        'title': _('Submit all documents of a type for OCR')
    }
    form_class = DocumentTypeSelectForm

    def form_valid(self, form):
        """Submit the selected type's documents, report the total, redirect."""
        document_type = form.cleaned_data['document_type']

        # ``queued`` stays at 0 when the type has no documents.
        queued = 0
        documents = document_type.documents.all()
        for queued, document in enumerate(documents, 1):
            document.submit_for_ocr()

        message = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'OCR queue.'
        ) % {
            'count': queued,
            'document_type': document_type
        }
        messages.success(self.request, message)

        return HttpResponseRedirect(self.get_success_url())

    def get_post_action_redirect(self):
        """Return the URL of the ``common:tools_list`` view."""
        return reverse('common:tools_list')
class DocumentTypeSettingsEditView(SingleObjectEditView):
    """Edit view for the ``auto_ocr`` field of a document type's OCR
    settings."""

    fields = ('auto_ocr',)
    view_permission = permission_document_type_ocr_setup

    def get_extra_context(self):
        """Return a title that names the document type being edited."""
        ocr_settings = self.get_object()
        heading = _('Edit OCR settings for document type: %s')
        return {'title': heading % ocr_settings.document_type}

    def get_object(self, queryset=None):
        """Return the ``ocr_settings`` of the document type in the URL.

        Raises Http404 when no document type has the ``pk`` from the URL.
        """
        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
        return document_type.ocr_settings
class DocumentOCRContent(SingleObjectDetailView):
class DocumentContentView(SingleObjectDetailView):
form_class = DocumentContentForm
model = Document
object_permission = permission_ocr_content_view
object_permission = permission_content_view
def dispatch(self, request, *args, **kwargs):
result = super(DocumentOCRContent, self).dispatch(
result = super(DocumentContentView, self).dispatch(
request, *args, **kwargs
)
self.get_object().add_as_recent_document_for_user(request.user)
@@ -145,23 +35,25 @@ class DocumentOCRContent(SingleObjectDetailView):
'document': self.get_object(),
'hide_labels': True,
'object': self.get_object(),
'title': _('OCR result for document: %s') % self.get_object(),
'title': _('Content for document: %s') % self.get_object(),
}
class EntryListView(SingleObjectListView):
extra_context = {
'hide_object': True,
'title': _('OCR errors'),
}
view_permission = permission_ocr_document
class DocumentContentDownloadView(SingleObjectDownloadView):
model = Document
object_permission = permission_content_view
def get_object_list(self):
return DocumentVersionOCRError.objects.all()
def get_file(self):
    """Return the document's content wrapped as a downloadable file.

    The content iterator from get_document_content() is wrapped in a
    TextIteratorIO and exposed through a VirtualFile named
    ``<document>-content``.
    """
    # Resolve the document once and reuse it for the content and the name.
    document = self.get_object()

    file_object = DocumentContentDownloadView.TextIteratorIO(
        iterator=get_document_content(document=document)
    )
    return DocumentContentDownloadView.VirtualFile(
        file=file_object, name='{}-content'.format(document)
    )
class DocumentOCRErrorsListView(SingleObjectListView):
view_permission = permission_ocr_document
class DocumentParsingErrorsListView(SingleObjectListView):
view_permission = permission_content_view
def get_document(self):
    """Return the document named by the URL ``pk`` or raise Http404."""
    document_pk = self.kwargs['pk']
    return get_object_or_404(Document, pk=document_pk)
@@ -170,21 +62,93 @@ class DocumentOCRErrorsListView(SingleObjectListView):
return {
'hide_object': True,
'object': self.get_document(),
'title': _('OCR errors for document: %s') % self.get_document(),
'title': _(
'Parsing errors for document: %s'
) % self.get_document(),
}
def get_object_list(self):
return self.get_document().latest_version.ocr_errors.all()
return self.get_document().latest_version.parsing_errors.all()
class DocumentOCRDownloadView(SingleObjectDownloadView):
class DocumentSubmitView(MultipleObjectConfirmActionView):
model = Document
object_permission = permission_ocr_content_view
object_permission = permission_parse_document
success_message = _(
'%(count)d document added to the parsing queue'
)
success_message_plural = _(
'%(count)d documents added to the parsing queue'
)
def get_file(self):
file_object = DocumentOCRDownloadView.TextIteratorIO(
iterator=get_document_ocr_content(document=self.get_object())
)
return DocumentOCRDownloadView.VirtualFile(
file=file_object, name='{}-OCR'.format(self.get_object())
def get_extra_context(self):
    """Return the confirmation context for the selected documents.

    With exactly one document the context carries it as ``object`` and
    names it in the title; otherwise only a pluralised title with the
    document count is returned.
    """
    queryset = self.get_queryset()
    # Count once: every queryset.count() call issues its own query.
    document_count = queryset.count()

    if document_count == 1:
        # Fetch the single document once and reuse it for both entries.
        document = queryset.first()
        return {
            'object': document,
            'title': _(
                'Submit document "%s" to the parsing queue'
            ) % document
        }

    # NOTE(review): only the singular form ends with a question mark; the
    # strings are left untouched here so existing translations still match.
    return {
        'title': ungettext(
            singular='Submit %(count)d document to the parsing queue?',
            plural='Submit %(count)d documents to the parsing queue',
            number=document_count
        ) % {
            'count': document_count,
        }
    }
def object_action(self, instance, form=None):
    """Submit ``instance`` for parsing.

    ``form`` is accepted but not used here.
    """
    instance.submit_for_parsing()
class DocumentTypeSubmitView(FormView):
    """Form view that submits all documents of a chosen type for parsing."""

    extra_context = {
        'title': _('Submit all documents of a type for parsing')
    }
    form_class = DocumentTypeSelectForm

    def form_valid(self, form):
        """Submit the selected type's documents, report the total, redirect."""
        document_type = form.cleaned_data['document_type']

        # ``queued`` stays at 0 when the type has no documents.
        queued = 0
        documents = document_type.documents.all()
        for queued, document in enumerate(documents, 1):
            document.submit_for_parsing()

        message = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'parsing queue.'
        ) % {
            'count': queued,
            'document_type': document_type
        }
        messages.success(self.request, message)

        return HttpResponseRedirect(self.get_success_url())

    def get_form_extra_kwargs(self):
        """Return the requesting user as the ``user`` entry."""
        # NOTE(review): presumably forwarded to the form constructor by the
        # base view -- confirm.
        return {'user': self.request.user}

    def get_post_action_redirect(self):
        """Return the URL of the ``common:tools_list`` view."""
        return reverse('common:tools_list')
class ParseErrorListView(SingleObjectListView):
    """List view over every recorded document version parse error."""

    view_permission = permission_content_view
    extra_context = {
        'hide_object': True,
        'title': _('Parsing errors'),
    }

    def get_object_list(self):
        """Return all DocumentVersionParseError rows, unfiltered."""
        parse_errors = DocumentVersionParseError.objects.all()
        return parse_errors

View File

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-08-23 18:55
from __future__ import unicode_literals
from django.db import migrations, models
import uuid
class Migration(migrations.Migration):
    """Alter ``document.uuid`` to a non-editable UUIDField defaulting to
    ``uuid.uuid4``."""

    dependencies = [('documents', '0040_auto_20170725_1111')]

    operations = [
        migrations.AlterField(
            model_name='document', name='uuid',
            field=models.UUIDField(default=uuid.uuid4, editable=False),
        ),
    ]