diff --git a/mayan/apps/document_parsing/admin.py b/mayan/apps/document_parsing/admin.py index 1bb19bf3ac..258da5ec3d 100644 --- a/mayan/apps/document_parsing/admin.py +++ b/mayan/apps/document_parsing/admin.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals from django.contrib import admin from .models import ( - DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError + DocumentPageContent, DocumentVersionParseError ) @@ -12,12 +12,7 @@ class DocumentPageContentAdmin(admin.ModelAdmin): list_display = ('document_page',) -@admin.register(DocumentTypeSettings) -class DocumentTypeSettingsAdmin(admin.ModelAdmin): - list_display = ('document_type', 'auto_ocr') - - -@admin.register(DocumentVersionOCRError) -class DocumentVersionOCRErrorAdmin(admin.ModelAdmin): +@admin.register(DocumentVersionParseError) +class DocumentVersionParseErrorAdmin(admin.ModelAdmin): list_display = ('document_version', 'datetime_submitted') readonly_fields = ('document_version', 'datetime_submitted', 'result') diff --git a/mayan/apps/document_parsing/api_views.py b/mayan/apps/document_parsing/api_views.py index ded56e8ed8..a074fec935 100644 --- a/mayan/apps/document_parsing/api_views.py +++ b/mayan/apps/document_parsing/api_views.py @@ -1,75 +1,19 @@ from __future__ import absolute_import, unicode_literals -from rest_framework import generics, status +from rest_framework import generics from rest_framework.response import Response -from documents.models import Document, DocumentPage, DocumentVersion +from documents.models import DocumentPage from rest_api.permissions import MayanPermission from .models import DocumentPageContent -from .permissions import permission_ocr_content_view, permission_ocr_document +from .permissions import permission_content_view from .serializers import DocumentPageContentSerializer -class APIDocumentOCRView(generics.GenericAPIView): - mayan_object_permissions = { - 'POST': (permission_ocr_document,) - } - permission_classes = (MayanPermission,) - 
queryset = Document.objects.all() - - def get_serializer_class(self): - return None - - def post(self, request, *args, **kwargs): - """ - Submit a document for OCR. - --- - omit_serializer: true - parameters: - - name: pk - paramType: path - type: number - responseMessages: - - code: 202 - message: Accepted - """ - - self.get_object().submit_for_ocr() - return Response(status=status.HTTP_202_ACCEPTED) - - -class APIDocumentVersionOCRView(generics.GenericAPIView): - mayan_object_permissions = { - 'POST': (permission_ocr_document,) - } - permission_classes = (MayanPermission,) - queryset = DocumentVersion.objects.all() - - def get_serializer_class(self): - return None - - def post(self, request, *args, **kwargs): - """ - Submit a document version for OCR. - --- - omit_serializer: true - parameters: - - name: pk - paramType: path - type: number - responseMessages: - - code: 202 - message: Accepted - """ - - self.get_object().submit_for_ocr() - return Response(status=status.HTTP_202_ACCEPTED) - - class APIDocumentPageContentView(generics.RetrieveAPIView): """ - Returns the OCR content of the selected document page. + Returns the content of the selected document page. 
--- GET: parameters: @@ -79,7 +23,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView): """ mayan_object_permissions = { - 'GET': (permission_ocr_content_view,), + 'GET': (permission_content_view,), } permission_classes = (MayanPermission,) serializer_class = DocumentPageContentSerializer @@ -89,9 +33,9 @@ class APIDocumentPageContentView(generics.RetrieveAPIView): instance = self.get_object() try: - ocr_content = instance.ocr_content + content = instance.content except DocumentPageContent.DoesNotExist: - ocr_content = DocumentPageContent.objects.none() + content = DocumentPageContent.objects.none() - serializer = self.get_serializer(ocr_content) + serializer = self.get_serializer(content) return Response(serializer.data) diff --git a/mayan/apps/document_parsing/apps.py b/mayan/apps/document_parsing/apps.py index 6b9a68d499..d65953daf7 100644 --- a/mayan/apps/document_parsing/apps.py +++ b/mayan/apps/document_parsing/apps.py @@ -1,11 +1,12 @@ from __future__ import unicode_literals +from datetime import timedelta import logging from kombu import Exchange, Queue from django.apps import apps -from django.db.models.signals import post_save +from django.utils.timezone import now from django.utils.translation import ugettext_lazy as _ from acls import ModelPermission @@ -21,16 +22,38 @@ from mayan.celery import app from navigation import SourceColumn from rest_api.classes import APIEndPoint +from .events import event_parsing_document_version_submit from .handlers import handler_parse_document_version from .links import ( - link_document_content, link_entry_list, link_document_content_errors_list, - link_document_content_download + link_document_content, link_document_content_download, + link_document_parsing_errors_list, link_document_submit_multiple, + link_document_submit, link_document_type_submit, link_error_list ) from .permissions import permission_content_view logger = logging.getLogger(__name__) +def document_parsing_submit(self): + latest_version = 
self.latest_version + # Don't error out if document has no version + if latest_version: + latest_version.submit_for_parsing() + + +def document_version_parsing_submit(self): + from .tasks import task_parse_document_version + + event_parsing_document_version_submit.commit( + action_object=self.document, target=self + ) + + task_parse_document_version.apply_async( + eta=now() + timedelta(seconds=settings_db_sync_task_delay.value), + kwargs={'document_version_pk': self.pk}, + ) + + class DocumentParsingApp(MayanAppConfig): has_tests = True name = 'document_parsing' @@ -45,16 +68,17 @@ class DocumentParsingApp(MayanAppConfig): app_label='documents', model_name='Document' ) - DocumentType = apps.get_model( - app_label='documents', model_name='DocumentType' - ) - DocumentVersion = apps.get_model( app_label='documents', model_name='DocumentVersion' ) DocumentVersionParseError = self.get_model('DocumentVersionParseError') + Document.add_to_class('submit_for_parsing', document_parsing_submit) + DocumentVersion.add_to_class( + 'submit_for_parsing', document_version_parsing_submit + ) + ModelPermission.register( model=Document, permissions=(permission_content_view,) ) @@ -72,6 +96,18 @@ class DocumentParsingApp(MayanAppConfig): attribute='result' ) + app.conf.CELERY_QUEUES.append( + Queue('parsing', Exchange('parsing'), routing_key='parsing'), + ) + + app.conf.CELERY_ROUTES.update( + { + 'document_parsing.tasks.task_parse_document_version': { + 'queue': 'parsing' + }, + } + ) + document_search.add_model_field( field='versions__pages__content__content', label=_('Content') ) @@ -89,32 +125,20 @@ class DocumentParsingApp(MayanAppConfig): menu_object.bind_links( links=(link_document_submit,), sources=(Document,) ) - menu_object.bind_links( - links=(link_document_type_ocr_settings,), sources=(DocumentType,) - ) menu_secondary.bind_links( links=( - link_document_content, link_document_ocr_erros_list, - link_document_ocr_download + link_document_content, 
link_document_parsing_errors_list, + link_document_content_download ), sources=( 'document_parsing:document_content', - 'document_parsing:document_ocr_error_list', - 'document_parsing:document_ocr_download', - ) - ) - menu_secondary.bind_links( - links=(link_entry_list,), - sources=( - 'document_parsing:entry_list', - 'document_parsing:entry_delete_multiple', - 'document_parsing:entry_re_queue_multiple', - DocumentVersionParseError + 'document_parsing:document_content_download', + 'document_parsing:document_parsing_error_list', ) ) menu_tools.bind_links( links=( - link_entry_list + link_document_type_submit, link_error_list, ) ) diff --git a/mayan/apps/document_parsing/events.py b/mayan/apps/document_parsing/events.py new file mode 100644 index 0000000000..875527e911 --- /dev/null +++ b/mayan/apps/document_parsing/events.py @@ -0,0 +1,14 @@ +from __future__ import absolute_import, unicode_literals + +from django.utils.translation import ugettext_lazy as _ + +from events.classes import Event + +event_parsing_document_version_submit = Event( + name='parsing_document_version_submit', + label=_('Document version submitted for parsing') +) +event_parsing_document_version_finish = Event( + name='parsing_document_version_finish', + label=_('Document version parsing finished') +) diff --git a/mayan/apps/document_parsing/exceptions.py b/mayan/apps/document_parsing/exceptions.py index 9fc7a9b90a..76f872cabd 100644 --- a/mayan/apps/document_parsing/exceptions.py +++ b/mayan/apps/document_parsing/exceptions.py @@ -1,13 +1,6 @@ from __future__ import unicode_literals -class OCRError(Exception): - """ - Raised by the OCR backend - """ - pass - - class ParserError(Exception): """ Base exception for file parsers diff --git a/mayan/apps/document_parsing/forms.py b/mayan/apps/document_parsing/forms.py index 0881a9185a..0a7fe44a5a 100644 --- a/mayan/apps/document_parsing/forms.py +++ b/mayan/apps/document_parsing/forms.py @@ -6,10 +6,12 @@ from django.utils.html import 
conditional_escape from django.utils.safestring import mark_safe from django.utils.translation import ugettext_lazy as _, ugettext +from acls.models import AccessControlList from common.widgets import TextAreaDiv from documents.models import DocumentType -from .models import DocumentPageContent, DocumentPageOCRContent +from .models import DocumentPageContent +from .permissions import permission_parse_document class DocumentContentForm(forms.Form): @@ -29,7 +31,7 @@ class DocumentContentForm(forms.Form): for page in document_pages: try: - page_content = page.ocr_content.content + page_content = page.content.content except DocumentPageContent.DoesNotExist: pass else: @@ -55,50 +57,16 @@ class DocumentContentForm(forms.Form): ) -class DocumentOCRContentForm(forms.Form): - """ - Form that concatenates all of a document pages' text content into a - single textarea widget - """ - def __init__(self, *args, **kwargs): - self.document = kwargs.pop('instance', None) - super(DocumentContentForm, self).__init__(*args, **kwargs) - content = [] - self.fields['contents'].initial = '' - try: - document_pages = self.document.pages.all() - except AttributeError: - document_pages = [] - - for page in document_pages: - try: - page_content = page.ocr_content.content - except DocumentPageOCRContent.DoesNotExist: - pass - else: - content.append(conditional_escape(force_text(page_content))) - content.append( - '\n\n\n
- %s -

\n\n\n' % ( - ugettext( - 'Page %(page_number)d' - ) % {'page_number': page.page_number} - ) - ) - - self.fields['contents'].initial = mark_safe(''.join(content)) - - contents = forms.CharField( - label=_('Contents'), - widget=TextAreaDiv( - attrs={ - 'class': 'text_area_div full-height', - 'data-height-difference': 360 - } - ) - ) - - class DocumentTypeSelectForm(forms.Form): document_type = forms.ModelChoiceField( - queryset=DocumentType.objects.all(), label=('Document type') + queryset=DocumentType.objects.none(), label=('Document type') ) + + def __init__(self, *args, **kwargs): + user = kwargs.pop('user') + super(DocumentTypeSelectForm, self).__init__(*args, **kwargs) + queryset = AccessControlList.objects.filter_by_access( + permission=permission_parse_document, + queryset=DocumentType.objects.all(), user=user, + ) + self.fields['document_type'].queryset = queryset diff --git a/mayan/apps/document_parsing/handlers.py b/mayan/apps/document_parsing/handlers.py index 618826246c..6e3338f484 100644 --- a/mayan/apps/document_parsing/handlers.py +++ b/mayan/apps/document_parsing/handlers.py @@ -2,14 +2,8 @@ from __future__ import unicode_literals import logging -from django.apps import apps - -from .settings import setting_auto_ocr -from .parsers import Parser - logger = logging.getLogger(__name__) def handler_parse_document_version(sender, instance, **kwargs): - if kwargs['created']: - Parser.parse_document_version(document_version=instance) + instance.submit_for_parsing() diff --git a/mayan/apps/document_parsing/links.py b/mayan/apps/document_parsing/links.py index cce30bcad5..ac84081841 100644 --- a/mayan/apps/document_parsing/links.py +++ b/mayan/apps/document_parsing/links.py @@ -4,24 +4,36 @@ from django.utils.translation import ugettext_lazy as _ from navigation import Link -from .permissions import permission_content_view +from .permissions import permission_content_view, permission_parse_document link_document_content = Link( args='resolved_object.id', 
icon='fa fa-font', permissions=(permission_content_view,), text=_('Content'), view='document_parsing:document_content', ) -link_entry_list = Link( - icon='fa fa-file-text-o', permissions=(permission_ocr_document,), - text=_('Parsing errors'), view='document_parsing:entry_list' -) -link_document_content_errors_list = Link( +link_document_parsing_errors_list = Link( args='resolved_object.id', icon='fa fa-file-text-o', - permissions=(permission_ocr_content_view,), text=_('Parsing errors'), - view='document_parsing:document_page_parsing_error_list' + permissions=(permission_content_view,), text=_('Parsing errors'), + view='document_parsing:document_parsing_error_list' ) link_document_content_download = Link( args='resolved_object.id', icon='fa fa-file-text-o', - permissions=(permission_ocr_content_view,), text=_('Download content'), + permissions=(permission_content_view,), text=_('Download content'), view='document_parsing:document_content_download' ) +link_document_submit_multiple = Link( + text=_('Submit for parsing'), + view='document_parsing:document_submit_multiple' +) +link_document_submit = Link( + args='resolved_object.id', permissions=(permission_parse_document,), + text=_('Submit for parsing'), view='document_parsing:document_submit' +) +link_document_type_submit = Link( + icon='fa fa-crosshairs', text=_('Parse documents per type'), + view='document_parsing:document_type_submit' +) +link_error_list = Link( + icon='fa fa-file-text-o', permissions=(permission_content_view,), + text=_('Parsing errors'), view='document_parsing:error_list' +) diff --git a/mayan/apps/document_parsing/managers.py b/mayan/apps/document_parsing/managers.py index 2e17131486..34f50be8da 100644 --- a/mayan/apps/document_parsing/managers.py +++ b/mayan/apps/document_parsing/managers.py @@ -1,14 +1,50 @@ from __future__ import unicode_literals -from datetime import timedelta import logging +import sys +import traceback -from django.apps import apps +from django.conf import settings from 
django.db import models -from django.utils.timezone import now + +from .events import event_parsing_document_version_finish +from .parsers import Parser logger = logging.getLogger(__name__) class DocumentPageContentManager(models.Manager): - pass + def process_document_version(self, document_version): + logger.info( + 'Starting parsing for document version: %s', document_version + ) + logger.debug('document version: %d', document_version.pk) + + try: + Parser.parse_document_version(document_version=document_version) + except Exception as exception: + logger.exception( + 'Parsing error for document version: %d; %s', + document_version.pk, exception, + ) + + if settings.DEBUG: + result = [] + type, value, tb = sys.exc_info() + result.append('%s: %s' % (type.__name__, value)) + result.extend(traceback.format_tb(tb)) + document_version.parsing_errors.create( + result='\n'.join(result) + ) + else: + document_version.parsing_errors.create(result=exception) + else: + logger.info( + 'Parsing complete for document version: %s', document_version + ) + document_version.parsing_errors.all().delete() + + event_parsing_document_version_finish.commit( + action_object=document_version.document, + target=document_version + ) diff --git a/mayan/apps/document_parsing/migrations/0001_initial.py b/mayan/apps/document_parsing/migrations/0001_initial.py new file mode 100644 index 0000000000..d421f6cd7b --- /dev/null +++ b/mayan/apps/document_parsing/migrations/0001_initial.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.10.7 on 2017-08-23 18:55 +from __future__ import unicode_literals + +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + initial = True + + dependencies = [ + ('documents', '0041_auto_20170823_1855'), + ] + + operations = [ + migrations.CreateModel( + name='DocumentPageContent', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, 
verbose_name='ID')), + ('content', models.TextField(blank=True, verbose_name='Content')), + ('document_page', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='content', to='documents.DocumentPage', verbose_name='Document page')), + ], + options={ + 'verbose_name': 'Document page content', + 'verbose_name_plural': 'Document pages contents', + }, + ), + migrations.CreateModel( + name='DocumentVersionParseError', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('datetime_submitted', models.DateTimeField(auto_now_add=True, db_index=True, verbose_name='Date time submitted')), + ('result', models.TextField(blank=True, null=True, verbose_name='Result')), + ('document_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='parse_errors', to='documents.DocumentVersion', verbose_name='Document version')), + ], + options={ + 'ordering': ('datetime_submitted',), + 'verbose_name': 'Document version parse error', + 'verbose_name_plural': 'Document version parse errors', + }, + ), + ] diff --git a/mayan/apps/document_parsing/migrations/__init__.py b/mayan/apps/document_parsing/migrations/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mayan/apps/document_parsing/models.py b/mayan/apps/document_parsing/models.py index 38dc9ff7f1..27d3a21266 100644 --- a/mayan/apps/document_parsing/models.py +++ b/mayan/apps/document_parsing/models.py @@ -4,7 +4,7 @@ from django.db import models from django.utils.encoding import force_text, python_2_unicode_compatible from django.utils.translation import ugettext_lazy as _ -from documents.models import DocumentPage, DocumentType, DocumentVersion +from documents.models import DocumentPage, DocumentVersion from .managers import DocumentPageContentManager @@ -30,11 +30,11 @@ class DocumentPageContent(models.Model): @python_2_unicode_compatible class DocumentVersionParseError(models.Model): 
document_version = models.ForeignKey( - DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors', - verbose_name=_('Document version') + DocumentVersion, on_delete=models.CASCADE, + related_name='parsing_errors', verbose_name=_('Document version') ) datetime_submitted = models.DateTimeField( - auto_add_now=True, db_index=True, verbose_name=_('Date time submitted') + auto_now_add=True, db_index=True, verbose_name=_('Date time submitted') ) result = models.TextField(blank=True, null=True, verbose_name=_('Result')) diff --git a/mayan/apps/document_parsing/parsers.py b/mayan/apps/document_parsing/parsers.py index 87570afa1f..977b83f437 100644 --- a/mayan/apps/document_parsing/parsers.py +++ b/mayan/apps/document_parsing/parsers.py @@ -1,20 +1,15 @@ from __future__ import unicode_literals -from io import BytesIO import logging import os -from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter -from pdfminer.pdfpage import PDFPage -from pdfminer.converter import TextConverter -from pdfminer.layout import LAParams import subprocess +from django.apps import apps from django.utils.translation import ugettext_lazy as _ from common.utils import copyfile, fs_cleanup, mkstemp from .exceptions import ParserError, NoMIMETypeMatch -from .models import DocumentPageContent from .settings import setting_pdftotext_path logger = logging.getLogger(__name__) @@ -82,6 +77,10 @@ class Parser(object): self.process_document_page(document_page=document_page) def process_document_page(self, document_page): + DocumentPageContent = apps.get_model( + app_label='document_parsing', model_name='DocumentPageContent' + ) + logger.info( 'Processing page: %d of document version: %s', document_page.page_number, document_page.document_version @@ -171,32 +170,7 @@ class PopplerParser(Parser): return output -class PDFMinerParser(Parser): - """ - Parser for PDF files using the PDFMiner library for Python - """ - - def execute(self, file_object, page_number): - 
logger.debug('Parsing PDF page: %d', page_number) - - with BytesIO() as string_buffer: - rsrcmgr = PDFResourceManager() - device = TextConverter( - rsrcmgr, outfp=string_buffer, laparams=LAParams() - ) - interpreter = PDFPageInterpreter(rsrcmgr, device) - page = PDFPage.get_pages( - file_object, maxpages=1, pagenos=(page_number - 1,) - ) - interpreter.process_page(page.next()) - device.close() - - logger.debug('Finished parsing PDF: %d', page_number) - - return string_buffer.getvalue() - - Parser.register( mimetypes=('application/pdf',), - parser_classes=(PopplerParser, PDFMinerParser) + parser_classes=(PopplerParser,) ) diff --git a/mayan/apps/document_parsing/permissions.py b/mayan/apps/document_parsing/permissions.py index fd003b8f35..f58deedfcb 100644 --- a/mayan/apps/document_parsing/permissions.py +++ b/mayan/apps/document_parsing/permissions.py @@ -9,3 +9,7 @@ namespace = PermissionNamespace('document_parsing', _('Document parsing')) permission_content_view = namespace.add_permission( name='content_view', label=_('View the content of a document') ) + +permission_parse_document = namespace.add_permission( + name='parse_document', label=_('Parse the content of a document') +) diff --git a/mayan/apps/document_parsing/queues.py b/mayan/apps/document_parsing/queues.py index 92297a2524..42036d2420 100644 --- a/mayan/apps/document_parsing/queues.py +++ b/mayan/apps/document_parsing/queues.py @@ -4,7 +4,8 @@ from django.utils.translation import ugettext_lazy as _ from task_manager.classes import CeleryQueue -queue_ocr = CeleryQueue(name='ocr', label=_('OCR')) +queue_ocr = CeleryQueue(name='parsing', label=_('Parsing')) queue_ocr.add_task_type( - name='ocr.tasks.task_do_ocr', label=_('Document version OCR') + name='document_parsing.tasks.task_parse_document_version', + label=_('Document version parsing') ) diff --git a/mayan/apps/document_parsing/tasks.py b/mayan/apps/document_parsing/tasks.py new file mode 100644 index 0000000000..4debffbc60 --- /dev/null +++ 
b/mayan/apps/document_parsing/tasks.py @@ -0,0 +1,29 @@ +from __future__ import unicode_literals + +import logging + +from django.apps import apps + +from mayan.celery import app + +logger = logging.getLogger(__name__) + + +@app.task(ignore_result=True) +def task_parse_document_version(document_version_pk): + DocumentVersion = apps.get_model( + app_label='documents', model_name='DocumentVersion' + ) + DocumentPageContent = apps.get_model( + app_label='document_parsing', model_name='DocumentPageContent' + ) + + document_version = DocumentVersion.objects.get( + pk=document_version_pk + ) + logger.info( + 'Starting parsing for document version: %s', document_version + ) + DocumentPageContent.objects.process_document_version( + document_version=document_version + ) diff --git a/mayan/apps/document_parsing/tests/test_api.py b/mayan/apps/document_parsing/tests/test_api.py deleted file mode 100644 index fb73bef98d..0000000000 --- a/mayan/apps/document_parsing/tests/test_api.py +++ /dev/null @@ -1,88 +0,0 @@ -from __future__ import unicode_literals - -import json - -from django.contrib.auth import get_user_model -from django.urls import reverse - -from rest_framework import status - -from documents.models import DocumentType -from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH -from rest_api.tests import BaseAPITestCase -from user_management.tests import ( - TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME -) - - -class OCRAPITestCase(BaseAPITestCase): - """ - Test the OCR app API endpoints - """ - - def setUp(self): - super(OCRAPITestCase, self).setUp() - - self.admin_user = get_user_model().objects.create_superuser( - username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL, - password=TEST_ADMIN_PASSWORD - ) - - self.client.login( - username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD - ) - - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - with open(TEST_SMALL_DOCUMENT_PATH) as 
file_object: - self.document = self.document_type.new_document( - file_object=file_object, - ) - - def tearDown(self): - self.document_type.delete() - super(OCRAPITestCase, self).tearDown() - - def test_submit_document(self): - response = self.client.post( - reverse( - 'rest_api:document-ocr-submit-view', - args=(self.document.pk,) - ) - ) - - self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) - - content = self.document.pages.first().ocr_content.content - - self.assertTrue('Mayan EDMS Documentation' in content) - - def test_submit_document_version(self): - response = self.client.post( - reverse( - 'rest_api:document-version-ocr-submit-view', - args=(self.document.latest_version.pk,) - ) - ) - - self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) - - content = self.document.pages.first().ocr_content.content - - self.assertTrue('Mayan EDMS Documentation' in content) - - def test_get_document_version_page_content(self): - response = self.client.get( - reverse( - 'rest_api:document-page-content-view', - args=(self.document.latest_version.pages.first().pk,) - ), - ) - - self.assertEqual(response.status_code, status.HTTP_200_OK) - - self.assertTrue( - 'Mayan EDMS Documentation' in json.loads(response.content)['content'] - ) diff --git a/mayan/apps/document_parsing/tests/test_events.py b/mayan/apps/document_parsing/tests/test_events.py index dc366623d2..073cc90663 100644 --- a/mayan/apps/document_parsing/tests/test_events.py +++ b/mayan/apps/document_parsing/tests/test_events.py @@ -2,40 +2,38 @@ from __future__ import unicode_literals from actstream.models import Action +from documents.tests.literals import TEST_DOCUMENT_FILENAME from documents.tests.test_models import GenericDocumentTestCase from ..events import ( - event_ocr_document_version_submit, event_ocr_document_version_finish + event_parsing_document_version_submit, + event_parsing_document_version_finish ) -class OCREventsTestCase(GenericDocumentTestCase): +class 
DocumentParsingEventsTestCase(GenericDocumentTestCase): + # Ensure we use a PDF file + test_document_filename = TEST_DOCUMENT_FILENAME + def test_document_version_submit_event(self): Action.objects.all().delete() - self.document.submit_for_ocr() - - self.assertEqual( - Action.objects.first().target, self.document.latest_version - ) - self.assertEqual( - Action.objects.first().verb, - event_ocr_document_version_submit.name - ) - - def test_document_version_finish_event(self): - Action.objects.all().delete() - self.document.submit_for_ocr() - from ..models import DocumentVersionOCRError, DocumentPageContent - #print DocumentVersionOCRError.objects.all() - print DocumentPageContent.objects.all() - - for a in Action.objects.all(): - print a + self.document.submit_for_parsing() self.assertEqual( Action.objects.last().target, self.document.latest_version ) self.assertEqual( Action.objects.last().verb, - event_ocr_document_version_finish.name + event_parsing_document_version_submit.name + ) + + def test_document_version_finish_event(self): + Action.objects.all().delete() + self.document.submit_for_parsing() + self.assertEqual( + Action.objects.first().target, self.document.latest_version + ) + self.assertEqual( + Action.objects.first().verb, + event_parsing_document_version_finish.name ) diff --git a/mayan/apps/document_parsing/tests/test_models.py b/mayan/apps/document_parsing/tests/test_models.py deleted file mode 100644 index 36dbb57f67..0000000000 --- a/mayan/apps/document_parsing/tests/test_models.py +++ /dev/null @@ -1,77 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals - -from common.tests import BaseTestCase -from documents.models import DocumentType -from documents.settings import setting_language_choices -from documents.tests import ( - TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH -) - - -class DocumentOCRTestCase(BaseTestCase): - # PyOCR's leak descriptor in get_available_languages and image_to_string - # 
Disable descriptor leak test until fixed in upstream - _skip_file_descriptor_test = True - - def setUp(self): - super(DocumentOCRTestCase, self).setUp() - - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - with open(TEST_SMALL_DOCUMENT_PATH) as file_object: - self.document = self.document_type.new_document( - file_object=file_object, - ) - - def tearDown(self): - self.document.delete() - self.document_type.delete() - super(DocumentOCRTestCase, self).tearDown() - - def test_ocr_language_backends_end(self): - content = self.document.pages.first().ocr_content.content - self.assertTrue('Mayan EDMS Documentation' in content) - - -class GermanOCRSupportTestCase(BaseTestCase): - # PyOCR's leak descriptor in get_available_languages and image_to_string - # Disable descriptor leak test until fixed in upstream - _skip_file_descriptor_test = True - - def setUp(self): - super(GermanOCRSupportTestCase, self).setUp() - - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - # Get corresponding language code for German from the default language - # choices list - language_code = [ - language for language in setting_language_choices.value if language[1] == 'German' - ][0][0] - - self.assertEqual('deu', language_code) - - with open(TEST_DEU_DOCUMENT_PATH) as file_object: - self.document = self.document_type.new_document( - file_object=file_object, language=language_code - ) - - def tearDown(self): - self.document_type.delete() - super(GermanOCRSupportTestCase, self).tearDown() - - def test_ocr_language_backends_end(self): - content = self.document.pages.first().ocr_content.content - - self.assertTrue( - 'Repository für elektronische Dokumente.' 
in content - ) - self.assertTrue( - 'Es bietet einen' in content - ) diff --git a/mayan/apps/document_parsing/tests/test_parsers.py b/mayan/apps/document_parsing/tests/test_parsers.py index 9d500a572a..a84f180bae 100644 --- a/mayan/apps/document_parsing/tests/test_parsers.py +++ b/mayan/apps/document_parsing/tests/test_parsers.py @@ -5,12 +5,9 @@ from django.test import override_settings from common.tests import BaseTestCase from documents.models import DocumentType -from documents.tests import ( - TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH -) +from documents.tests import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL -from ..classes import TextExtractor -from ..parsers import PDFMinerParser, PopplerParser +from ..parsers import PopplerParser @override_settings(OCR_AUTO_OCR=False) @@ -30,54 +27,11 @@ class ParserTestCase(BaseTestCase): self.document_type.delete() super(ParserTestCase, self).tearDown() - def test_pdfminer_parser(self): - parser = PDFMinerParser() - - parser.process_document_version(self.document.latest_version) - - self.assertTrue( - 'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content - ) - def test_poppler_parser(self): parser = PopplerParser() parser.process_document_version(self.document.latest_version) self.assertTrue( - 'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content - ) - - -@override_settings(OCR_AUTO_OCR=False) -class TextExtractorTestCase(BaseTestCase): - def setUp(self): - super(TextExtractorTestCase, self).setUp() - - self.document_type = DocumentType.objects.create( - label=TEST_DOCUMENT_TYPE_LABEL - ) - - with open(TEST_HYBRID_DOCUMENT_PATH) as file_object: - self.document = self.document_type.new_document( - file_object=File(file_object) - ) - - def tearDown(self): - self.document_type.delete() - super(TextExtractorTestCase, self).tearDown() - - def test_text_extractor(self): - TextExtractor.process_document_version( - 
document_version=self.document.latest_version - ) - - self.assertEqual( - self.document.latest_version.pages.first().ocr_content.content, - 'Sample text', - ) - - self.assertEqual( - self.document.latest_version.pages.last().ocr_content.content, - 'Sample text in image form', + 'Mayan EDMS Documentation' in self.document.pages.first().content.content ) diff --git a/mayan/apps/document_parsing/tests/test_views.py b/mayan/apps/document_parsing/tests/test_views.py index 41b0462103..995250e09e 100644 --- a/mayan/apps/document_parsing/tests/test_views.py +++ b/mayan/apps/document_parsing/tests/test_views.py @@ -1,26 +1,25 @@ from __future__ import unicode_literals -from django.test import override_settings - +from documents.tests.literals import TEST_DOCUMENT_FILENAME from documents.tests.test_views import GenericDocumentViewTestCase -from ..permissions import permission_ocr_content_view -from ..utils import get_document_ocr_content +from ..permissions import permission_content_view +from ..utils import get_document_content -@override_settings(OCR_AUTO_OCR=True) -class OCRViewsTestCase(GenericDocumentViewTestCase): - # PyOCR's leak descriptor in get_available_languages and image_to_string - # Disable descriptor leak test until fixed in upstream +class DocumentContentViewsTestCase(GenericDocumentViewTestCase): _skip_file_descriptor_test = True + # Ensure we use a PDF file + test_document_filename = TEST_DOCUMENT_FILENAME + def setUp(self): - super(OCRViewsTestCase, self).setUp() + super(DocumentContentViewsTestCase, self).setUp() self.login_user() def _document_content_view(self): return self.get( - 'ocr:document_content', args=(self.document.pk,) + 'document_parsing:document_content', args=(self.document.pk,) ) def test_document_content_view_no_permissions(self): @@ -29,7 +28,7 @@ class OCRViewsTestCase(GenericDocumentViewTestCase): self.assertEqual(response.status_code, 403) def test_document_content_view_with_permission(self): - 
self.grant_permission(permission=permission_ocr_content_view) + self.grant_permission(permission=permission_content_view) response = self._document_content_view() @@ -37,25 +36,25 @@ class OCRViewsTestCase(GenericDocumentViewTestCase): response, 'Mayan EDMS Documentation', status_code=200 ) - def test_document_ocr_download_view_no_permission(self): + def test_document_parsing_download_view_no_permission(self): response = self.get( - 'ocr:document_ocr_download', args=(self.document.pk,) + 'document_parsing:document_content_download', args=(self.document.pk,) ) self.assertEqual(response.status_code, 403) - def test_document_download_view_with_permission(self): + def test_download_view_with_permission(self): self.expected_content_type = 'application/octet-stream; charset=utf-8' - self.grant_permission(permission=permission_ocr_content_view) + self.grant_permission(permission=permission_content_view) response = self.get( - 'ocr:document_ocr_download', args=(self.document.pk,) + 'document_parsing:document_content_download', args=(self.document.pk,) ) self.assertEqual(response.status_code, 200) self.assert_download_response( response, content=( - ''.join(get_document_ocr_content(document=self.document)) + ''.join(get_document_content(document=self.document)) ), ) diff --git a/mayan/apps/document_parsing/urls.py b/mayan/apps/document_parsing/urls.py index 30f1b59359..334babf9bd 100644 --- a/mayan/apps/document_parsing/urls.py +++ b/mayan/apps/document_parsing/urls.py @@ -2,62 +2,43 @@ from __future__ import unicode_literals from django.conf.urls import url -from .api_views import ( - APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView -) +from .api_views import APIDocumentPageContentView from .views import ( - DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView, - DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView, - DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView + DocumentContentView, 
DocumentContentDownloadView, + DocumentParsingErrorsListView, DocumentSubmitView, DocumentTypeSubmitView, + ParseErrorListView ) urlpatterns = [ url( - r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(), + r'^documents/(?P<pk>\d+)/content/$', DocumentContentView.as_view(), name='document_content' ), url( - r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(), - name='document_submit' + r'^documents/(?P<pk>\d+)/content/download/$', + DocumentContentDownloadView.as_view(), name='document_content_download' ), url( - r'^document/all/submit/$', DocumentAllSubmitView.as_view(), - name='document_submit_all' - ), - url( - r'^document/type/submit/$', DocumentTypeSubmitView.as_view(), + r'^document_types/submit/$', DocumentTypeSubmitView.as_view(), name='document_type_submit' ), url( - r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(), + r'^documents/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(), + name='document_submit' + ), + url( + r'^documents/multiple/submit/$', DocumentSubmitView.as_view(), name='document_submit_multiple' ), url( - r'^document_type/(?P<pk>\d+)/ocr/settings/$', - DocumentTypeSettingsEditView.as_view(), - name='document_type_ocr_settings' + r'^documents/(?P<pk>\d+)/errors/$', + DocumentParsingErrorsListView.as_view(), + name='document_parsing_error_list' ), - url( - r'^documents/(?P<pk>\d+)/ocr/errors/$', - DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list' - ), - url( - r'^documents/(?P<pk>\d+)/ocr/download/$', - DocumentOCRDownloadView.as_view(), name='document_ocr_download' - ), - url(r'^all/$', EntryListView.as_view(), name='entry_list'), + url(r'^errors/all/$', ParseErrorListView.as_view(), name='error_list'), ] api_urls = [ - url( - r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(), - name='document-ocr-submit-view' - ), - url( - r'^document_version/(?P<pk>\d+)/submit/$', - APIDocumentVersionOCRView.as_view(), - name='document-version-ocr-submit-view' - ), url( r'^page/(?P<pk>\d+)/content/$', 
APIDocumentPageContentView.as_view(), name='document-page-content-view' diff --git a/mayan/apps/document_parsing/utils.py b/mayan/apps/document_parsing/utils.py index 8175c3040e..bbb498af20 100644 --- a/mayan/apps/document_parsing/utils.py +++ b/mayan/apps/document_parsing/utils.py @@ -6,10 +6,10 @@ from django.utils.html import conditional_escape from .models import DocumentPageContent -def get_document_ocr_content(document): +def get_document_content(document): for page in document.pages.all(): try: - page_content = page.ocr_content.content + page_content = page.content.content except DocumentPageContent.DoesNotExist: pass else: diff --git a/mayan/apps/document_parsing/views.py b/mayan/apps/document_parsing/views.py index fa0871735b..032371d853 100644 --- a/mayan/apps/document_parsing/views.py +++ b/mayan/apps/document_parsing/views.py @@ -4,137 +4,27 @@ from django.contrib import messages from django.http import HttpResponseRedirect from django.shortcuts import get_object_or_404 from django.urls import reverse -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext_lazy as _, ungettext -from acls.models import AccessControlList from common.generics import ( - ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView, - SingleObjectEditView, SingleObjectListView + FormView, MultipleObjectConfirmActionView, SingleObjectDetailView, + SingleObjectDownloadView, SingleObjectListView ) -from common.mixins import MultipleInstanceActionMixin -from documents.models import Document, DocumentType +from documents.models import Document from .forms import DocumentContentForm, DocumentTypeSelectForm -from .models import DocumentVersionOCRError -from .permissions import ( - permission_ocr_content_view, permission_ocr_document, - permission_document_type_ocr_setup -) -from .utils import get_document_ocr_content +from .models import DocumentVersionParseError +from .permissions import permission_content_view, 
permission_parse_document +from .utils import get_document_content -class DocumentAllSubmitView(ConfirmView): - extra_context = {'title': _('Submit all documents for OCR?')} - - def get_post_action_redirect(self): - return reverse('common:tools_list') - - def view_action(self): - count = 0 - for document in Document.objects.all(): - document.submit_for_ocr() - count += 1 - - messages.success( - self.request, _('%d documents added to the OCR queue.') % count - ) - - -class DocumentSubmitView(ConfirmView): - def get_extra_context(self): - return { - 'object': self.get_object(), - 'title': _('Submit "%s" to the OCR queue?') % self.get_object() - } - - def get_object(self): - return Document.objects.get(pk=self.kwargs['pk']) - - def object_action(self, instance): - AccessControlList.objects.check_access( - permissions=permission_ocr_document, user=self.request.user, - obj=instance - ) - - instance.submit_for_ocr() - - def view_action(self): - instance = self.get_object() - - self.object_action(instance=instance) - - messages.success( - self.request, - _('Document: %(document)s was added to the OCR queue.') % { - 'document': instance - } - ) - - -class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView): - model = Document - success_message = '%(count)d document submitted to the OCR queue.' - success_message_plural = '%(count)d documents submitted to the OCR queue.' 
- - def get_extra_context(self): - # Override the base class method - return { - 'title': _('Submit the selected documents to the OCR queue?') - } - - -class DocumentTypeSubmitView(FormView): - form_class = DocumentTypeSelectForm - extra_context = { - 'title': _('Submit all documents of a type for OCR') - } - - def get_post_action_redirect(self): - return reverse('common:tools_list') - - def form_valid(self, form): - count = 0 - for document in form.cleaned_data['document_type'].documents.all(): - document.submit_for_ocr() - count += 1 - - messages.success( - self.request, _( - '%(count)d documents of type "%(document_type)s" added to the ' - 'OCR queue.' - ) % { - 'count': count, - 'document_type': form.cleaned_data['document_type'] - } - ) - - return HttpResponseRedirect(self.get_success_url()) - - -class DocumentTypeSettingsEditView(SingleObjectEditView): - fields = ('auto_ocr',) - view_permission = permission_document_type_ocr_setup - - def get_object(self, queryset=None): - return get_object_or_404( - DocumentType, pk=self.kwargs['pk'] - ).ocr_settings - - def get_extra_context(self): - return { - 'title': _( - 'Edit OCR settings for document type: %s' - ) % self.get_object().document_type - } - - -class DocumentOCRContent(SingleObjectDetailView): +class DocumentContentView(SingleObjectDetailView): form_class = DocumentContentForm model = Document - object_permission = permission_ocr_content_view + object_permission = permission_content_view def dispatch(self, request, *args, **kwargs): - result = super(DocumentOCRContent, self).dispatch( + result = super(DocumentContentView, self).dispatch( request, *args, **kwargs ) self.get_object().add_as_recent_document_for_user(request.user) @@ -145,23 +35,25 @@ class DocumentOCRContent(SingleObjectDetailView): 'document': self.get_object(), 'hide_labels': True, 'object': self.get_object(), - 'title': _('OCR result for document: %s') % self.get_object(), + 'title': _('Content for document: %s') % self.get_object(), } 
-class EntryListView(SingleObjectListView): - extra_context = { - 'hide_object': True, - 'title': _('OCR errors'), - } - view_permission = permission_ocr_document +class DocumentContentDownloadView(SingleObjectDownloadView): + model = Document + object_permission = permission_content_view - def get_object_list(self): - return DocumentVersionOCRError.objects.all() + def get_file(self): + file_object = DocumentContentDownloadView.TextIteratorIO( + iterator=get_document_content(document=self.get_object()) + ) + return DocumentContentDownloadView.VirtualFile( + file=file_object, name='{}-content'.format(self.get_object()) + ) -class DocumentOCRErrorsListView(SingleObjectListView): - view_permission = permission_ocr_document +class DocumentParsingErrorsListView(SingleObjectListView): + view_permission = permission_content_view def get_document(self): return get_object_or_404(Document, pk=self.kwargs['pk']) @@ -170,21 +62,93 @@ class DocumentOCRErrorsListView(SingleObjectListView): return { 'hide_object': True, 'object': self.get_document(), - 'title': _('OCR errors for document: %s') % self.get_document(), + 'title': _( + 'Parsing errors for document: %s' + ) % self.get_document(), } def get_object_list(self): - return self.get_document().latest_version.ocr_errors.all() + return self.get_document().latest_version.parsing_errors.all() -class DocumentOCRDownloadView(SingleObjectDownloadView): +class DocumentSubmitView(MultipleObjectConfirmActionView): model = Document - object_permission = permission_ocr_content_view + object_permission = permission_parse_document + success_message = _( + '%(count)d document added to the parsing queue' + ) + success_message_plural = _( + '%(count)d documents added to the parsing queue' + ) - def get_file(self): - file_object = DocumentOCRDownloadView.TextIteratorIO( - iterator=get_document_ocr_content(document=self.get_object()) - ) - return DocumentOCRDownloadView.VirtualFile( - file=file_object, name='{}-OCR'.format(self.get_object()) + 
def get_extra_context(self): + queryset = self.get_queryset() + + result = { + 'title': ungettext( + singular='Submit %(count)d document to the parsing queue?', + plural='Submit %(count)d documents to the parsing queue', + number=queryset.count() + ) % { + 'count': queryset.count(), + } + } + + if queryset.count() == 1: + result.update( + { + 'object': queryset.first(), + 'title': _( + 'Submit document "%s" to the parsing queue' + ) % queryset.first() + } + ) + + return result + + def object_action(self, instance, form=None): + instance.submit_for_parsing() + + +class DocumentTypeSubmitView(FormView): + form_class = DocumentTypeSelectForm + extra_context = { + 'title': _('Submit all documents of a type for parsing') + } + + def get_form_extra_kwargs(self): + return { + 'user': self.request.user + } + + def get_post_action_redirect(self): + return reverse('common:tools_list') + + def form_valid(self, form): + count = 0 + for document in form.cleaned_data['document_type'].documents.all(): + document.submit_for_parsing() + count += 1 + + messages.success( + self.request, _( + '%(count)d documents of type "%(document_type)s" added to the ' + 'parsing queue.' 
+ ) % { + 'count': count, + 'document_type': form.cleaned_data['document_type'] + } ) + + return HttpResponseRedirect(self.get_success_url()) + + +class ParseErrorListView(SingleObjectListView): + extra_context = { + 'hide_object': True, + 'title': _('Parsing errors'), + } + view_permission = permission_content_view + + def get_object_list(self): + return DocumentVersionParseError.objects.all() diff --git a/mayan/apps/documents/migrations/0041_auto_20170823_1855.py b/mayan/apps/documents/migrations/0041_auto_20170823_1855.py new file mode 100644 index 0000000000..fcc01e0c6f --- /dev/null +++ b/mayan/apps/documents/migrations/0041_auto_20170823_1855.py @@ -0,0 +1,21 @@ +# -*- coding: utf-8 -*- +# Generated by Django 1.10.7 on 2017-08-23 18:55 +from __future__ import unicode_literals + +from django.db import migrations, models +import uuid + + +class Migration(migrations.Migration): + + dependencies = [ + ('documents', '0040_auto_20170725_1111'), + ] + + operations = [ + migrations.AlterField( + model_name='document', + name='uuid', + field=models.UUIDField(default=uuid.uuid4, editable=False), + ), + ]