Finish the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-24 03:54:07 -04:00
parent e9591c92f9
commit a7eaf6b368
25 changed files with 423 additions and 639 deletions

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
from django.contrib import admin from django.contrib import admin
from .models import ( from .models import (
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError DocumentPageContent, DocumentVersionParseError
) )
@@ -12,12 +12,7 @@ class DocumentPageContentAdmin(admin.ModelAdmin):
list_display = ('document_page',) list_display = ('document_page',)
@admin.register(DocumentTypeSettings) @admin.register(DocumentVersionParseError)
class DocumentTypeSettingsAdmin(admin.ModelAdmin): class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
list_display = ('document_type', 'auto_ocr')
@admin.register(DocumentVersionOCRError)
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
list_display = ('document_version', 'datetime_submitted') list_display = ('document_version', 'datetime_submitted')
readonly_fields = ('document_version', 'datetime_submitted', 'result') readonly_fields = ('document_version', 'datetime_submitted', 'result')

View File

@@ -1,75 +1,19 @@
from __future__ import absolute_import, unicode_literals from __future__ import absolute_import, unicode_literals
from rest_framework import generics, status from rest_framework import generics
from rest_framework.response import Response from rest_framework.response import Response
from documents.models import Document, DocumentPage, DocumentVersion from documents.models import DocumentPage
from rest_api.permissions import MayanPermission from rest_api.permissions import MayanPermission
from .models import DocumentPageContent from .models import DocumentPageContent
from .permissions import permission_ocr_content_view, permission_ocr_document from .permissions import permission_content_view
from .serializers import DocumentPageContentSerializer from .serializers import DocumentPageContentSerializer
class APIDocumentOCRView(generics.GenericAPIView):
mayan_object_permissions = {
'POST': (permission_ocr_document,)
}
permission_classes = (MayanPermission,)
queryset = Document.objects.all()
def get_serializer_class(self):
return None
def post(self, request, *args, **kwargs):
"""
Submit a document for OCR.
---
omit_serializer: true
parameters:
- name: pk
paramType: path
type: number
responseMessages:
- code: 202
message: Accepted
"""
self.get_object().submit_for_ocr()
return Response(status=status.HTTP_202_ACCEPTED)
class APIDocumentVersionOCRView(generics.GenericAPIView):
mayan_object_permissions = {
'POST': (permission_ocr_document,)
}
permission_classes = (MayanPermission,)
queryset = DocumentVersion.objects.all()
def get_serializer_class(self):
return None
def post(self, request, *args, **kwargs):
"""
Submit a document version for OCR.
---
omit_serializer: true
parameters:
- name: pk
paramType: path
type: number
responseMessages:
- code: 202
message: Accepted
"""
self.get_object().submit_for_ocr()
return Response(status=status.HTTP_202_ACCEPTED)
class APIDocumentPageContentView(generics.RetrieveAPIView): class APIDocumentPageContentView(generics.RetrieveAPIView):
""" """
Returns the OCR content of the selected document page. Returns the content of the selected document page.
--- ---
GET: GET:
parameters: parameters:
@@ -79,7 +23,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
""" """
mayan_object_permissions = { mayan_object_permissions = {
'GET': (permission_ocr_content_view,), 'GET': (permission_content_view,),
} }
permission_classes = (MayanPermission,) permission_classes = (MayanPermission,)
serializer_class = DocumentPageContentSerializer serializer_class = DocumentPageContentSerializer
@@ -89,9 +33,9 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
instance = self.get_object() instance = self.get_object()
try: try:
ocr_content = instance.ocr_content content = instance.content
except DocumentPageContent.DoesNotExist: except DocumentPageContent.DoesNotExist:
ocr_content = DocumentPageContent.objects.none() content = DocumentPageContent.objects.none()
serializer = self.get_serializer(ocr_content) serializer = self.get_serializer(content)
return Response(serializer.data) return Response(serializer.data)

View File

@@ -1,11 +1,12 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from datetime import timedelta
import logging import logging
from kombu import Exchange, Queue from kombu import Exchange, Queue
from django.apps import apps from django.apps import apps
from django.db.models.signals import post_save from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from acls import ModelPermission from acls import ModelPermission
@@ -21,16 +22,38 @@ from mayan.celery import app
from navigation import SourceColumn from navigation import SourceColumn
from rest_api.classes import APIEndPoint from rest_api.classes import APIEndPoint
from .events import event_parsing_document_version_submit
from .handlers import handler_parse_document_version from .handlers import handler_parse_document_version
from .links import ( from .links import (
link_document_content, link_entry_list, link_document_content_errors_list, link_document_content, link_document_content_download,
link_document_content_download link_document_parsing_errors_list, link_document_submit_multiple,
link_document_submit, link_document_type_submit, link_error_list
) )
from .permissions import permission_content_view from .permissions import permission_content_view
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def document_parsing_submit(self):
    """
    Submit the latest version of this document for parsing.

    Does nothing when the document has no latest version.
    """
    version = self.latest_version

    if not version:
        # A document without a version has nothing to parse; this is not
        # treated as an error.
        return

    version.submit_for_parsing()
def document_version_parsing_submit(self):
    """
    Queue this document version for parsing.

    Commits the "submitted for parsing" event with the parent document as
    the action object and this version as the target, then schedules the
    parsing task with an ETA of now plus the configured sync task delay.
    """
    # The task is imported inside the function, not at module level.
    from .tasks import task_parse_document_version

    event_parsing_document_version_submit.commit(
        action_object=self.document, target=self
    )

    run_at = now() + timedelta(seconds=settings_db_sync_task_delay.value)
    task_parse_document_version.apply_async(
        eta=run_at, kwargs={'document_version_pk': self.pk},
    )
class DocumentParsingApp(MayanAppConfig): class DocumentParsingApp(MayanAppConfig):
has_tests = True has_tests = True
name = 'document_parsing' name = 'document_parsing'
@@ -45,16 +68,17 @@ class DocumentParsingApp(MayanAppConfig):
app_label='documents', model_name='Document' app_label='documents', model_name='Document'
) )
DocumentType = apps.get_model(
app_label='documents', model_name='DocumentType'
)
DocumentVersion = apps.get_model( DocumentVersion = apps.get_model(
app_label='documents', model_name='DocumentVersion' app_label='documents', model_name='DocumentVersion'
) )
DocumentVersionParseError = self.get_model('DocumentVersionParseError') DocumentVersionParseError = self.get_model('DocumentVersionParseError')
Document.add_to_class('submit_for_parsing', document_parsing_submit)
DocumentVersion.add_to_class(
'submit_for_parsing', document_version_parsing_submit
)
ModelPermission.register( ModelPermission.register(
model=Document, permissions=(permission_content_view,) model=Document, permissions=(permission_content_view,)
) )
@@ -72,6 +96,18 @@ class DocumentParsingApp(MayanAppConfig):
attribute='result' attribute='result'
) )
app.conf.CELERY_QUEUES.append(
Queue('parsing', Exchange('parsing'), routing_key='parsing'),
)
app.conf.CELERY_ROUTES.update(
{
'document_parsing.tasks.task_parse_document_version': {
'queue': 'parsing'
},
}
)
document_search.add_model_field( document_search.add_model_field(
field='versions__pages__content__content', label=_('Content') field='versions__pages__content__content', label=_('Content')
) )
@@ -89,32 +125,20 @@ class DocumentParsingApp(MayanAppConfig):
menu_object.bind_links( menu_object.bind_links(
links=(link_document_submit,), sources=(Document,) links=(link_document_submit,), sources=(Document,)
) )
menu_object.bind_links(
links=(link_document_type_ocr_settings,), sources=(DocumentType,)
)
menu_secondary.bind_links( menu_secondary.bind_links(
links=( links=(
link_document_content, link_document_ocr_erros_list, link_document_content, link_document_parsing_errors_list,
link_document_ocr_download link_document_content_download
), ),
sources=( sources=(
'document_parsing:document_content', 'document_parsing:document_content',
'document_parsing:document_ocr_error_list', 'document_parsing:document_content_download',
'document_parsing:document_ocr_download', 'document_parsing:document_parsing_error_list',
)
)
menu_secondary.bind_links(
links=(link_entry_list,),
sources=(
'document_parsing:entry_list',
'document_parsing:entry_delete_multiple',
'document_parsing:entry_re_queue_multiple',
DocumentVersionParseError
) )
) )
menu_tools.bind_links( menu_tools.bind_links(
links=( links=(
link_entry_list link_document_type_submit, link_error_list,
) )
) )

View File

@@ -0,0 +1,14 @@
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from events.classes import Event
# Event for a document version being submitted for parsing.
event_parsing_document_version_submit = Event(
    label=_('Document version submitted for parsing'),
    name='parsing_document_version_submit'
)

# Event for the parsing of a document version having finished.
event_parsing_document_version_finish = Event(
    label=_('Document version parsing finished'),
    name='parsing_document_version_finish'
)

View File

@@ -1,13 +1,6 @@
from __future__ import unicode_literals from __future__ import unicode_literals
class OCRError(Exception):
"""
Raised by the OCR backend
"""
pass
class ParserError(Exception): class ParserError(Exception):
""" """
Base exception for file parsers Base exception for file parsers

View File

@@ -6,10 +6,12 @@ from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
from django.utils.translation import ugettext_lazy as _, ugettext from django.utils.translation import ugettext_lazy as _, ugettext
from acls.models import AccessControlList
from common.widgets import TextAreaDiv from common.widgets import TextAreaDiv
from documents.models import DocumentType from documents.models import DocumentType
from .models import DocumentPageContent, DocumentPageOCRContent from .models import DocumentPageContent
from .permissions import permission_parse_document
class DocumentContentForm(forms.Form): class DocumentContentForm(forms.Form):
@@ -29,7 +31,7 @@ class DocumentContentForm(forms.Form):
for page in document_pages: for page in document_pages:
try: try:
page_content = page.ocr_content.content page_content = page.content.content
except DocumentPageContent.DoesNotExist: except DocumentPageContent.DoesNotExist:
pass pass
else: else:
@@ -55,50 +57,16 @@ class DocumentContentForm(forms.Form):
) )
class DocumentOCRContentForm(forms.Form):
"""
Form that concatenates all of a document pages' text content into a
single textarea widget
"""
def __init__(self, *args, **kwargs):
self.document = kwargs.pop('instance', None)
super(DocumentContentForm, self).__init__(*args, **kwargs)
content = []
self.fields['contents'].initial = ''
try:
document_pages = self.document.pages.all()
except AttributeError:
document_pages = []
for page in document_pages:
try:
page_content = page.ocr_content.content
except DocumentPageOCRContent.DoesNotExist:
pass
else:
content.append(conditional_escape(force_text(page_content)))
content.append(
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
ugettext(
'Page %(page_number)d'
) % {'page_number': page.page_number}
)
)
self.fields['contents'].initial = mark_safe(''.join(content))
contents = forms.CharField(
label=_('Contents'),
widget=TextAreaDiv(
attrs={
'class': 'text_area_div full-height',
'data-height-difference': 360
}
)
)
class DocumentTypeSelectForm(forms.Form): class DocumentTypeSelectForm(forms.Form):
document_type = forms.ModelChoiceField( document_type = forms.ModelChoiceField(
queryset=DocumentType.objects.all(), label=('Document type') queryset=DocumentType.objects.none(), label=('Document type')
) )
def __init__(self, *args, **kwargs):
    """
    Build the form with a per-user document type queryset.

    The required 'user' keyword argument is removed from kwargs before
    the parent initializer runs. The 'document_type' field queryset is
    then replaced with the result of filtering all document types through
    AccessControlList.objects.filter_by_access for the parse document
    permission and that user.
    """
    requesting_user = kwargs.pop('user')
    super(DocumentTypeSelectForm, self).__init__(*args, **kwargs)

    self.fields['document_type'].queryset = (
        AccessControlList.objects.filter_by_access(
            permission=permission_parse_document,
            queryset=DocumentType.objects.all(), user=requesting_user,
        )
    )

View File

@@ -2,14 +2,8 @@ from __future__ import unicode_literals
import logging import logging
from django.apps import apps
from .settings import setting_auto_ocr
from .parsers import Parser
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
def handler_parse_document_version(sender, instance, **kwargs): def handler_parse_document_version(sender, instance, **kwargs):
if kwargs['created']: instance.submit_for_parsing()
Parser.parse_document_version(document_version=instance)

View File

@@ -4,24 +4,36 @@ from django.utils.translation import ugettext_lazy as _
from navigation import Link from navigation import Link
from .permissions import permission_content_view from .permissions import permission_content_view, permission_parse_document
link_document_content = Link( link_document_content = Link(
args='resolved_object.id', icon='fa fa-font', args='resolved_object.id', icon='fa fa-font',
permissions=(permission_content_view,), text=_('Content'), permissions=(permission_content_view,), text=_('Content'),
view='document_parsing:document_content', view='document_parsing:document_content',
) )
link_entry_list = Link( link_document_parsing_errors_list = Link(
icon='fa fa-file-text-o', permissions=(permission_ocr_document,),
text=_('Parsing errors'), view='document_parsing:entry_list'
)
link_document_content_errors_list = Link(
args='resolved_object.id', icon='fa fa-file-text-o', args='resolved_object.id', icon='fa fa-file-text-o',
permissions=(permission_ocr_content_view,), text=_('Parsing errors'), permissions=(permission_content_view,), text=_('Parsing errors'),
view='document_parsing:document_page_parsing_error_list' view='document_parsing:document_parsing_error_list'
) )
link_document_content_download = Link( link_document_content_download = Link(
args='resolved_object.id', icon='fa fa-file-text-o', args='resolved_object.id', icon='fa fa-file-text-o',
permissions=(permission_ocr_content_view,), text=_('Download content'), permissions=(permission_content_view,), text=_('Download content'),
view='document_parsing:document_content_download' view='document_parsing:document_content_download'
) )
# Submit several documents for parsing; no permissions are declared on
# this link.
link_document_submit_multiple = Link(
    view='document_parsing:document_submit_multiple',
    text=_('Submit for parsing')
)

# Submit a single document for parsing; receives the resolved object id.
link_document_submit = Link(
    view='document_parsing:document_submit',
    text=_('Submit for parsing'),
    permissions=(permission_parse_document,),
    args='resolved_object.id'
)

# Parse documents per document type; no permissions are declared on this
# link.
link_document_type_submit = Link(
    view='document_parsing:document_type_submit',
    text=_('Parse documents per type'),
    icon='fa fa-crosshairs'
)

# List of parsing errors.
link_error_list = Link(
    view='document_parsing:error_list',
    text=_('Parsing errors'),
    permissions=(permission_content_view,),
    icon='fa fa-file-text-o'
)

View File

@@ -1,14 +1,50 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from datetime import timedelta
import logging import logging
import sys
import traceback
from django.apps import apps from django.conf import settings
from django.db import models from django.db import models
from django.utils.timezone import now
from .events import event_parsing_document_version_finish
from .parsers import Parser
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class DocumentPageContentManager(models.Manager): class DocumentPageContentManager(models.Manager):
def process_document_version(self, document_version):
    """
    Parse a document version and record the outcome.

    Runs Parser.parse_document_version() for the given document version.

    On failure the exception is logged and a parsing error entry is
    created for the document version. When settings.DEBUG is enabled the
    entry holds the exception type, message and formatted traceback;
    otherwise it holds only the exception message. The exception is not
    re-raised.

    On success all existing parsing error entries of the document version
    are deleted and the "parsing finished" event is committed with the
    document as the action object and the version as the target.
    """
    logger.info(
        'Starting parsing for document version: %s', document_version
    )
    logger.debug('document version: %d', document_version.pk)

    try:
        Parser.parse_document_version(document_version=document_version)
    except Exception as exception:
        logger.exception(
            'Parsing error for document version: %d; %s',
            document_version.pk, exception,
        )

        if settings.DEBUG:
            # Local names avoid shadowing the `type` builtin.
            exc_type, exc_value, exc_traceback = sys.exc_info()
            result = ['%s: %s' % (exc_type.__name__, exc_value)]
            result.extend(traceback.format_tb(exc_traceback))
            document_version.parsing_errors.create(
                result='\n'.join(result)
            )
        else:
            # Store the message text explicitly instead of handing the
            # exception object to the text field.
            document_version.parsing_errors.create(
                result='%s' % exception
            )
    else:
        logger.info(
            'Parsing complete for document version: %s', document_version
        )
        # A successful parse supersedes any previously recorded errors.
        document_version.parsing_errors.all().delete()

        event_parsing_document_version_finish.commit(
            action_object=document_version.document,
            target=document_version
        )

View File

@@ -0,0 +1,44 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-08-23 18:55
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """
    Initial migration: creates the DocumentPageContent and
    DocumentVersionParseError models.
    """

    initial = True

    dependencies = [
        ('documents', '0041_auto_20170823_1855'),
    ]

    operations = [
        migrations.CreateModel(
            name='DocumentPageContent',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('content', models.TextField(blank=True, verbose_name='Content')),
                ('document_page', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='content', to='documents.DocumentPage', verbose_name='Document page')),
            ],
            options={
                'verbose_name': 'Document page content',
                'verbose_name_plural': 'Document pages contents',
            },
        ),
        migrations.CreateModel(
            name='DocumentVersionParseError',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('datetime_submitted', models.DateTimeField(auto_now_add=True, db_index=True, verbose_name='Date time submitted')),
                ('result', models.TextField(blank=True, null=True, verbose_name='Result')),
                # related_name matches the model definition and the
                # `document_version.parsing_errors` accessor used by the
                # manager, keeping the migration state in sync with the
                # model.
                ('document_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='parsing_errors', to='documents.DocumentVersion', verbose_name='Document version')),
            ],
            options={
                'ordering': ('datetime_submitted',),
                'verbose_name': 'Document version parse error',
                'verbose_name_plural': 'Document version parse errors',
            },
        ),
    ]

View File

@@ -4,7 +4,7 @@ from django.db import models
from django.utils.encoding import force_text, python_2_unicode_compatible from django.utils.encoding import force_text, python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from documents.models import DocumentPage, DocumentType, DocumentVersion from documents.models import DocumentPage, DocumentVersion
from .managers import DocumentPageContentManager from .managers import DocumentPageContentManager
@@ -30,11 +30,11 @@ class DocumentPageContent(models.Model):
@python_2_unicode_compatible @python_2_unicode_compatible
class DocumentVersionParseError(models.Model): class DocumentVersionParseError(models.Model):
document_version = models.ForeignKey( document_version = models.ForeignKey(
DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors', DocumentVersion, on_delete=models.CASCADE,
verbose_name=_('Document version') related_name='parsing_errors', verbose_name=_('Document version')
) )
datetime_submitted = models.DateTimeField( datetime_submitted = models.DateTimeField(
auto_add_now=True, db_index=True, verbose_name=_('Date time submitted') auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
) )
result = models.TextField(blank=True, null=True, verbose_name=_('Result')) result = models.TextField(blank=True, null=True, verbose_name=_('Result'))

View File

@@ -1,20 +1,15 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from io import BytesIO
import logging import logging
import os import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import subprocess import subprocess
from django.apps import apps
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from common.utils import copyfile, fs_cleanup, mkstemp from common.utils import copyfile, fs_cleanup, mkstemp
from .exceptions import ParserError, NoMIMETypeMatch from .exceptions import ParserError, NoMIMETypeMatch
from .models import DocumentPageContent
from .settings import setting_pdftotext_path from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -82,6 +77,10 @@ class Parser(object):
self.process_document_page(document_page=document_page) self.process_document_page(document_page=document_page)
def process_document_page(self, document_page): def process_document_page(self, document_page):
DocumentPageContent = apps.get_model(
app_label='document_parsing', model_name='DocumentPageContent'
)
logger.info( logger.info(
'Processing page: %d of document version: %s', 'Processing page: %d of document version: %s',
document_page.page_number, document_page.document_version document_page.page_number, document_page.document_version
@@ -171,32 +170,7 @@ class PopplerParser(Parser):
return output return output
class PDFMinerParser(Parser):
"""
Parser for PDF files using the PDFMiner library for Python
"""
def execute(self, file_object, page_number):
logger.debug('Parsing PDF page: %d', page_number)
with BytesIO() as string_buffer:
rsrcmgr = PDFResourceManager()
device = TextConverter(
rsrcmgr, outfp=string_buffer, laparams=LAParams()
)
interpreter = PDFPageInterpreter(rsrcmgr, device)
page = PDFPage.get_pages(
file_object, maxpages=1, pagenos=(page_number - 1,)
)
interpreter.process_page(page.next())
device.close()
logger.debug('Finished parsing PDF: %d', page_number)
return string_buffer.getvalue()
Parser.register( Parser.register(
mimetypes=('application/pdf',), mimetypes=('application/pdf',),
parser_classes=(PopplerParser, PDFMinerParser) parser_classes=(PopplerParser,)
) )

View File

@@ -9,3 +9,7 @@ namespace = PermissionNamespace('document_parsing', _('Document parsing'))
permission_content_view = namespace.add_permission( permission_content_view = namespace.add_permission(
name='content_view', label=_('View the content of a document') name='content_view', label=_('View the content of a document')
) )
# Permission for parsing the content of a document.
permission_parse_document = namespace.add_permission(
    label=_('Parse the content of a document'), name='parse_document'
)

View File

@@ -4,7 +4,8 @@ from django.utils.translation import ugettext_lazy as _
from task_manager.classes import CeleryQueue from task_manager.classes import CeleryQueue
queue_ocr = CeleryQueue(name='ocr', label=_('OCR')) queue_ocr = CeleryQueue(name='parsing', label=_('Parsing'))
queue_ocr.add_task_type( queue_ocr.add_task_type(
name='ocr.tasks.task_do_ocr', label=_('Document version OCR') name='document_parsing.tasks.task_parse_document_version',
label=_('Document version parsing')
) )

View File

@@ -0,0 +1,29 @@
from __future__ import unicode_literals
import logging
from django.apps import apps
from mayan.celery import app
logger = logging.getLogger(__name__)
@app.task(ignore_result=True)
def task_parse_document_version(document_version_pk):
    """
    Task that parses the document version with the given primary key.

    Both models are resolved through the app registry at call time. The
    document version is fetched by primary key and handed to the
    DocumentPageContent manager's process_document_version().
    """
    get_model = apps.get_model

    version_model = get_model(
        app_label='documents', model_name='DocumentVersion'
    )
    content_model = get_model(
        app_label='document_parsing', model_name='DocumentPageContent'
    )

    version = version_model.objects.get(pk=document_version_pk)

    logger.info('Starting parsing for document version: %s', version)

    content_model.objects.process_document_version(document_version=version)

View File

@@ -1,88 +0,0 @@
from __future__ import unicode_literals
import json
from django.contrib.auth import get_user_model
from django.urls import reverse
from rest_framework import status
from documents.models import DocumentType
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
from rest_api.tests import BaseAPITestCase
from user_management.tests import (
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
)
class OCRAPITestCase(BaseAPITestCase):
"""
Test the OCR app API endpoints
"""
def setUp(self):
super(OCRAPITestCase, self).setUp()
self.admin_user = get_user_model().objects.create_superuser(
username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
password=TEST_ADMIN_PASSWORD
)
self.client.login(
username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
)
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object,
)
def tearDown(self):
self.document_type.delete()
super(OCRAPITestCase, self).tearDown()
def test_submit_document(self):
response = self.client.post(
reverse(
'rest_api:document-ocr-submit-view',
args=(self.document.pk,)
)
)
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
content = self.document.pages.first().ocr_content.content
self.assertTrue('Mayan EDMS Documentation' in content)
def test_submit_document_version(self):
response = self.client.post(
reverse(
'rest_api:document-version-ocr-submit-view',
args=(self.document.latest_version.pk,)
)
)
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
content = self.document.pages.first().ocr_content.content
self.assertTrue('Mayan EDMS Documentation' in content)
def test_get_document_version_page_content(self):
response = self.client.get(
reverse(
'rest_api:document-page-content-view',
args=(self.document.latest_version.pages.first().pk,)
),
)
self.assertEqual(response.status_code, status.HTTP_200_OK)
self.assertTrue(
'Mayan EDMS Documentation' in json.loads(response.content)['content']
)

View File

@@ -2,40 +2,38 @@ from __future__ import unicode_literals
from actstream.models import Action from actstream.models import Action
from documents.tests.literals import TEST_DOCUMENT_FILENAME
from documents.tests.test_models import GenericDocumentTestCase from documents.tests.test_models import GenericDocumentTestCase
from ..events import ( from ..events import (
event_ocr_document_version_submit, event_ocr_document_version_finish event_parsing_document_version_submit,
event_parsing_document_version_finish
) )
class OCREventsTestCase(GenericDocumentTestCase): class DocumentParsingEventsTestCase(GenericDocumentTestCase):
# Ensure we use a PDF file
test_document_filename = TEST_DOCUMENT_FILENAME
def test_document_version_submit_event(self): def test_document_version_submit_event(self):
Action.objects.all().delete() Action.objects.all().delete()
self.document.submit_for_ocr() self.document.submit_for_parsing()
self.assertEqual(
Action.objects.first().target, self.document.latest_version
)
self.assertEqual(
Action.objects.first().verb,
event_ocr_document_version_submit.name
)
def test_document_version_finish_event(self):
Action.objects.all().delete()
self.document.submit_for_ocr()
from ..models import DocumentVersionOCRError, DocumentPageContent
#print DocumentVersionOCRError.objects.all()
print DocumentPageContent.objects.all()
for a in Action.objects.all():
print a
self.assertEqual( self.assertEqual(
Action.objects.last().target, self.document.latest_version Action.objects.last().target, self.document.latest_version
) )
self.assertEqual( self.assertEqual(
Action.objects.last().verb, Action.objects.last().verb,
event_ocr_document_version_finish.name event_parsing_document_version_submit.name
)
def test_document_version_finish_event(self):
Action.objects.all().delete()
self.document.submit_for_parsing()
self.assertEqual(
Action.objects.first().target, self.document.latest_version
)
self.assertEqual(
Action.objects.first().verb,
event_parsing_document_version_finish.name
) )

View File

@@ -1,77 +0,0 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.settings import setting_language_choices
from documents.tests import (
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
)
class DocumentOCRTestCase(BaseTestCase):
# PyOCR's leak descriptor in get_available_languages and image_to_string
# Disable descriptor leak test until fixed in upstream
_skip_file_descriptor_test = True
def setUp(self):
super(DocumentOCRTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object,
)
def tearDown(self):
self.document.delete()
self.document_type.delete()
super(DocumentOCRTestCase, self).tearDown()
def test_ocr_language_backends_end(self):
content = self.document.pages.first().ocr_content.content
self.assertTrue('Mayan EDMS Documentation' in content)
class GermanOCRSupportTestCase(BaseTestCase):
# PyOCR's leak descriptor in get_available_languages and image_to_string
# Disable descriptor leak test until fixed in upstream
_skip_file_descriptor_test = True
def setUp(self):
super(GermanOCRSupportTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
# Get corresponding language code for German from the default language
# choices list
language_code = [
language for language in setting_language_choices.value if language[1] == 'German'
][0][0]
self.assertEqual('deu', language_code)
with open(TEST_DEU_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=file_object, language=language_code
)
def tearDown(self):
self.document_type.delete()
super(GermanOCRSupportTestCase, self).tearDown()
def test_ocr_language_backends_end(self):
content = self.document.pages.first().ocr_content.content
self.assertTrue(
'Repository für elektronische Dokumente.' in content
)
self.assertTrue(
'Es bietet einen' in content
)

View File

@@ -5,12 +5,9 @@ from django.test import override_settings
from common.tests import BaseTestCase from common.tests import BaseTestCase
from documents.models import DocumentType from documents.models import DocumentType
from documents.tests import ( from documents.tests import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
)
from ..classes import TextExtractor from ..parsers import PopplerParser
from ..parsers import PDFMinerParser, PopplerParser
@override_settings(OCR_AUTO_OCR=False) @override_settings(OCR_AUTO_OCR=False)
@@ -30,54 +27,11 @@ class ParserTestCase(BaseTestCase):
self.document_type.delete() self.document_type.delete()
super(ParserTestCase, self).tearDown() super(ParserTestCase, self).tearDown()
def test_pdfminer_parser(self):
parser = PDFMinerParser()
parser.process_document_version(self.document.latest_version)
self.assertTrue(
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
)
def test_poppler_parser(self): def test_poppler_parser(self):
parser = PopplerParser() parser = PopplerParser()
parser.process_document_version(self.document.latest_version) parser.process_document_version(self.document.latest_version)
self.assertTrue( self.assertTrue(
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content 'Mayan EDMS Documentation' in self.document.pages.first().content.content
)
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
def setUp(self):
super(TextExtractorTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
self.document = self.document_type.new_document(
file_object=File(file_object)
)
def tearDown(self):
self.document_type.delete()
super(TextExtractorTestCase, self).tearDown()
def test_text_extractor(self):
TextExtractor.process_document_version(
document_version=self.document.latest_version
)
self.assertEqual(
self.document.latest_version.pages.first().ocr_content.content,
'Sample text',
)
self.assertEqual(
self.document.latest_version.pages.last().ocr_content.content,
'Sample text in image form',
) )

View File

@@ -1,26 +1,25 @@
from __future__ import unicode_literals from __future__ import unicode_literals
from django.test import override_settings from documents.tests.literals import TEST_DOCUMENT_FILENAME
from documents.tests.test_views import GenericDocumentViewTestCase from documents.tests.test_views import GenericDocumentViewTestCase
from ..permissions import permission_ocr_content_view from ..permissions import permission_content_view
from ..utils import get_document_ocr_content from ..utils import get_document_content
@override_settings(OCR_AUTO_OCR=True) class DocumentContentViewsTestCase(GenericDocumentViewTestCase):
class OCRViewsTestCase(GenericDocumentViewTestCase):
# PyOCR's leak descriptor in get_available_languages and image_to_string
# Disable descriptor leak test until fixed in upstream
_skip_file_descriptor_test = True _skip_file_descriptor_test = True
# Ensure we use a PDF file
test_document_filename = TEST_DOCUMENT_FILENAME
def setUp(self): def setUp(self):
super(OCRViewsTestCase, self).setUp() super(DocumentContentViewsTestCase, self).setUp()
self.login_user() self.login_user()
def _document_content_view(self): def _document_content_view(self):
return self.get( return self.get(
'ocr:document_content', args=(self.document.pk,) 'document_parsing:document_content', args=(self.document.pk,)
) )
def test_document_content_view_no_permissions(self): def test_document_content_view_no_permissions(self):
@@ -29,7 +28,7 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
self.assertEqual(response.status_code, 403) self.assertEqual(response.status_code, 403)
def test_document_content_view_with_permission(self): def test_document_content_view_with_permission(self):
self.grant_permission(permission=permission_ocr_content_view) self.grant_permission(permission=permission_content_view)
response = self._document_content_view() response = self._document_content_view()
@@ -37,25 +36,25 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
response, 'Mayan EDMS Documentation', status_code=200 response, 'Mayan EDMS Documentation', status_code=200
) )
def test_document_ocr_download_view_no_permission(self): def test_document_parsing_download_view_no_permission(self):
response = self.get( response = self.get(
'ocr:document_ocr_download', args=(self.document.pk,) 'document_parsing:document_content_download', args=(self.document.pk,)
) )
self.assertEqual(response.status_code, 403) self.assertEqual(response.status_code, 403)
def test_document_download_view_with_permission(self): def test_download_view_with_permission(self):
self.expected_content_type = 'application/octet-stream; charset=utf-8' self.expected_content_type = 'application/octet-stream; charset=utf-8'
self.grant_permission(permission=permission_ocr_content_view) self.grant_permission(permission=permission_content_view)
response = self.get( response = self.get(
'ocr:document_ocr_download', args=(self.document.pk,) 'document_parsing:document_content_download', args=(self.document.pk,)
) )
self.assertEqual(response.status_code, 200) self.assertEqual(response.status_code, 200)
self.assert_download_response( self.assert_download_response(
response, content=( response, content=(
''.join(get_document_ocr_content(document=self.document)) ''.join(get_document_content(document=self.document))
), ),
) )

View File

@@ -2,62 +2,43 @@ from __future__ import unicode_literals
from django.conf.urls import url from django.conf.urls import url
from .api_views import ( from .api_views import APIDocumentPageContentView
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
)
from .views import ( from .views import (
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView, DocumentContentView, DocumentContentDownloadView,
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView, DocumentParsingErrorsListView, DocumentSubmitView, DocumentTypeSubmitView,
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView ParseErrorListView
) )
urlpatterns = [ urlpatterns = [
url( url(
r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(), r'^documents/(?P<pk>\d+)/content/$', DocumentContentView.as_view(),
name='document_content' name='document_content'
), ),
url( url(
r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(), r'^documents/(?P<pk>\d+)/content/download/$',
name='document_submit' DocumentContentDownloadView.as_view(), name='document_content_download'
), ),
url( url(
r'^document/all/submit/$', DocumentAllSubmitView.as_view(), r'^document_types/submit/$', DocumentTypeSubmitView.as_view(),
name='document_submit_all'
),
url(
r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
name='document_type_submit' name='document_type_submit'
), ),
url( url(
r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(), r'^documents/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
name='document_submit'
),
url(
r'^documents/multiple/submit/$', DocumentSubmitView.as_view(),
name='document_submit_multiple' name='document_submit_multiple'
), ),
url( url(
r'^document_type/(?P<pk>\d+)/ocr/settings/$', r'^documents/(?P<pk>\d+)/errors/$',
DocumentTypeSettingsEditView.as_view(), DocumentParsingErrorsListView.as_view(),
name='document_type_ocr_settings' name='document_parsing_error_list'
), ),
url( url(r'^errors/all/$', ParseErrorListView.as_view(), name='error_list'),
r'^documents/(?P<pk>\d+)/ocr/errors/$',
DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
),
url(
r'^documents/(?P<pk>\d+)/ocr/download/$',
DocumentOCRDownloadView.as_view(), name='document_ocr_download'
),
url(r'^all/$', EntryListView.as_view(), name='entry_list'),
] ]
api_urls = [ api_urls = [
url(
r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
name='document-ocr-submit-view'
),
url(
r'^document_version/(?P<pk>\d+)/submit/$',
APIDocumentVersionOCRView.as_view(),
name='document-version-ocr-submit-view'
),
url( url(
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(), r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
name='document-page-content-view' name='document-page-content-view'

View File

@@ -6,10 +6,10 @@ from django.utils.html import conditional_escape
from .models import DocumentPageContent from .models import DocumentPageContent
def get_document_ocr_content(document): def get_document_content(document):
for page in document.pages.all(): for page in document.pages.all():
try: try:
page_content = page.ocr_content.content page_content = page.content.content
except DocumentPageContent.DoesNotExist: except DocumentPageContent.DoesNotExist:
pass pass
else: else:

View File

@@ -4,137 +4,27 @@ from django.contrib import messages
from django.http import HttpResponseRedirect from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404 from django.shortcuts import get_object_or_404
from django.urls import reverse from django.urls import reverse
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _, ungettext
from acls.models import AccessControlList
from common.generics import ( from common.generics import (
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView, FormView, MultipleObjectConfirmActionView, SingleObjectDetailView,
SingleObjectEditView, SingleObjectListView SingleObjectDownloadView, SingleObjectListView
) )
from common.mixins import MultipleInstanceActionMixin from documents.models import Document
from documents.models import Document, DocumentType
from .forms import DocumentContentForm, DocumentTypeSelectForm from .forms import DocumentContentForm, DocumentTypeSelectForm
from .models import DocumentVersionOCRError from .models import DocumentVersionParseError
from .permissions import ( from .permissions import permission_content_view, permission_parse_document
permission_ocr_content_view, permission_ocr_document, from .utils import get_document_content
permission_document_type_ocr_setup
)
from .utils import get_document_ocr_content
class DocumentAllSubmitView(ConfirmView): class DocumentContentView(SingleObjectDetailView):
extra_context = {'title': _('Submit all documents for OCR?')}
def get_post_action_redirect(self):
return reverse('common:tools_list')
def view_action(self):
count = 0
for document in Document.objects.all():
document.submit_for_ocr()
count += 1
messages.success(
self.request, _('%d documents added to the OCR queue.') % count
)
class DocumentSubmitView(ConfirmView):
def get_extra_context(self):
return {
'object': self.get_object(),
'title': _('Submit "%s" to the OCR queue?') % self.get_object()
}
def get_object(self):
return Document.objects.get(pk=self.kwargs['pk'])
def object_action(self, instance):
AccessControlList.objects.check_access(
permissions=permission_ocr_document, user=self.request.user,
obj=instance
)
instance.submit_for_ocr()
def view_action(self):
instance = self.get_object()
self.object_action(instance=instance)
messages.success(
self.request,
_('Document: %(document)s was added to the OCR queue.') % {
'document': instance
}
)
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
model = Document
success_message = '%(count)d document submitted to the OCR queue.'
success_message_plural = '%(count)d documents submitted to the OCR queue.'
def get_extra_context(self):
# Override the base class method
return {
'title': _('Submit the selected documents to the OCR queue?')
}
class DocumentTypeSubmitView(FormView):
form_class = DocumentTypeSelectForm
extra_context = {
'title': _('Submit all documents of a type for OCR')
}
def get_post_action_redirect(self):
return reverse('common:tools_list')
def form_valid(self, form):
count = 0
for document in form.cleaned_data['document_type'].documents.all():
document.submit_for_ocr()
count += 1
messages.success(
self.request, _(
'%(count)d documents of type "%(document_type)s" added to the '
'OCR queue.'
) % {
'count': count,
'document_type': form.cleaned_data['document_type']
}
)
return HttpResponseRedirect(self.get_success_url())
class DocumentTypeSettingsEditView(SingleObjectEditView):
fields = ('auto_ocr',)
view_permission = permission_document_type_ocr_setup
def get_object(self, queryset=None):
return get_object_or_404(
DocumentType, pk=self.kwargs['pk']
).ocr_settings
def get_extra_context(self):
return {
'title': _(
'Edit OCR settings for document type: %s'
) % self.get_object().document_type
}
class DocumentOCRContent(SingleObjectDetailView):
form_class = DocumentContentForm form_class = DocumentContentForm
model = Document model = Document
object_permission = permission_ocr_content_view object_permission = permission_content_view
def dispatch(self, request, *args, **kwargs): def dispatch(self, request, *args, **kwargs):
result = super(DocumentOCRContent, self).dispatch( result = super(DocumentContentView, self).dispatch(
request, *args, **kwargs request, *args, **kwargs
) )
self.get_object().add_as_recent_document_for_user(request.user) self.get_object().add_as_recent_document_for_user(request.user)
@@ -145,23 +35,25 @@ class DocumentOCRContent(SingleObjectDetailView):
'document': self.get_object(), 'document': self.get_object(),
'hide_labels': True, 'hide_labels': True,
'object': self.get_object(), 'object': self.get_object(),
'title': _('OCR result for document: %s') % self.get_object(), 'title': _('Content for document: %s') % self.get_object(),
} }
class EntryListView(SingleObjectListView): class DocumentContentDownloadView(SingleObjectDownloadView):
extra_context = { model = Document
'hide_object': True, object_permission = permission_content_view
'title': _('OCR errors'),
}
view_permission = permission_ocr_document
def get_object_list(self): def get_file(self):
return DocumentVersionOCRError.objects.all() file_object = DocumentContentDownloadView.TextIteratorIO(
iterator=get_document_content(document=self.get_object())
)
return DocumentContentDownloadView.VirtualFile(
file=file_object, name='{}-content'.format(self.get_object())
)
class DocumentOCRErrorsListView(SingleObjectListView): class DocumentParsingErrorsListView(SingleObjectListView):
view_permission = permission_ocr_document view_permission = permission_content_view
def get_document(self): def get_document(self):
return get_object_or_404(Document, pk=self.kwargs['pk']) return get_object_or_404(Document, pk=self.kwargs['pk'])
@@ -170,21 +62,93 @@ class DocumentOCRErrorsListView(SingleObjectListView):
return { return {
'hide_object': True, 'hide_object': True,
'object': self.get_document(), 'object': self.get_document(),
'title': _('OCR errors for document: %s') % self.get_document(), 'title': _(
'Parsing errors for document: %s'
) % self.get_document(),
} }
def get_object_list(self): def get_object_list(self):
return self.get_document().latest_version.ocr_errors.all() return self.get_document().latest_version.parsing_errors.all()
class DocumentOCRDownloadView(SingleObjectDownloadView): class DocumentSubmitView(MultipleObjectConfirmActionView):
model = Document model = Document
object_permission = permission_ocr_content_view object_permission = permission_parse_document
success_message = _(
'%(count)d document added to the parsing queue'
)
success_message_plural = _(
'%(count)d documents added to the parsing queue'
)
def get_file(self): def get_extra_context(self):
file_object = DocumentOCRDownloadView.TextIteratorIO( queryset = self.get_queryset()
iterator=get_document_ocr_content(document=self.get_object())
) result = {
return DocumentOCRDownloadView.VirtualFile( 'title': ungettext(
file=file_object, name='{}-OCR'.format(self.get_object()) singular='Submit %(count)d document to the parsing queue?',
plural='Submit %(count)d documents to the parsing queue',
number=queryset.count()
) % {
'count': queryset.count(),
}
}
if queryset.count() == 1:
result.update(
{
'object': queryset.first(),
'title': _(
'Submit document "%s" to the parsing queue'
) % queryset.first()
}
)
return result
def object_action(self, instance, form=None):
instance.submit_for_parsing()
class DocumentTypeSubmitView(FormView):
form_class = DocumentTypeSelectForm
extra_context = {
'title': _('Submit all documents of a type for parsing')
}
def get_form_extra_kwargs(self):
return {
'user': self.request.user
}
def get_post_action_redirect(self):
return reverse('common:tools_list')
def form_valid(self, form):
count = 0
for document in form.cleaned_data['document_type'].documents.all():
document.submit_for_parsing()
count += 1
messages.success(
self.request, _(
'%(count)d documents of type "%(document_type)s" added to the '
'parsing queue.'
) % {
'count': count,
'document_type': form.cleaned_data['document_type']
}
) )
return HttpResponseRedirect(self.get_success_url())
class ParseErrorListView(SingleObjectListView):
extra_context = {
'hide_object': True,
'title': _('Parsing errors'),
}
view_permission = permission_content_view
def get_object_list(self):
return DocumentVersionParseError.objects.all()

View File

@@ -0,0 +1,21 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-08-23 18:55
from __future__ import unicode_literals
from django.db import migrations, models
import uuid
class Migration(migrations.Migration):
dependencies = [
('documents', '0040_auto_20170725_1111'),
]
operations = [
migrations.AlterField(
model_name='document',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, editable=False),
),
]