From 916c3497c452f96d6d9bcfaecaaa101093054730 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sat, 1 Jul 2017 01:07:23 -0400 Subject: [PATCH] Add support for downloading a document's OCR text. Closes GitLab issue #215. Signed-off-by: Roberto Rosario --- HISTORY.rst | 7 ++- docs/releases/2.5.rst | 84 ++++++++++++++++++++++++++++++ mayan/apps/ocr/apps.py | 11 ++-- mayan/apps/ocr/links.py | 5 ++ mayan/apps/ocr/tests/test_views.py | 25 +++++++++ mayan/apps/ocr/urls.py | 10 ++-- mayan/apps/ocr/utils.py | 14 +++++ mayan/apps/ocr/views.py | 18 ++++++- 8 files changed, 165 insertions(+), 9 deletions(-) create mode 100644 docs/releases/2.5.rst create mode 100644 mayan/apps/ocr/utils.py diff --git a/HISTORY.rst b/HISTORY.rst index 78aa249379..01a6085c52 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -1,3 +1,8 @@ +2.5 (2017-07-XX) +================ +- Add view to download a document's OCR text. GitLab issue #215. + + 2.4 (2017-06-23) ================ - Add Django-mathfilters. @@ -9,7 +14,7 @@ - Make tags, metadata types and cabinets searchable via the dynamic search API. GitLab issue #344. - Add support for updating configuration options from environment variables. - Add purgelocks management command. GitLab issue #221. -- Fix index rebuilding for multi value first levels. GitLab issue #391. +- Fix index rebuilding for multi value first levels. GitLab issue #391. - Truncate views titles via the APPEARANCE_MAXIMUM_TITLE_LENGTH setting. GitLab issue #217. - Add background task manager app. GitLab issue #132. - Add link to show a document's OCR errors. GitLab issue #291. diff --git a/docs/releases/2.5.rst b/docs/releases/2.5.rst new file mode 100644 index 0000000000..3ae51f6364 --- /dev/null +++ b/docs/releases/2.5.rst @@ -0,0 +1,84 @@ +============================= +Mayan EDMS v2.5 release notes +============================= + +Released: July XX, 2017 + +What's new +========== + + +Other Changes +------------- +- Add view to download a document's OCR text. GitLab issue #215.
+- Add missing OCR migration. +- Improve error output of the performupgrade command to debug upgrade errors + that could stop an upgrade (missing document files, etc.). +- Enable the django-mathfilters app added in version 2.4. +- Do a complete pull and synchronization of the translations to fix missing + translations for Polish. Thanks to Wojtek Warczakowski for the report. +- Allow null for the SANE source resolution field. Even though the field was + marked as allowing blank values it was failing because it is a number field + and number fields need to allow explicit null values when left blank. +- Rename the mayan_task_manager app to task_manager. +- Make the task manager translatable. +- Add Turkish to the list of processed languages. + + +Removals +-------- +* None + +Upgrading from a previous version +--------------------------------- + +Using PIP +~~~~~~~~~ + +Type in the console:: + + $ pip install -U mayan-edms + +the requirements will also be updated automatically. + +Using Git +~~~~~~~~~ + +If you installed Mayan EDMS by cloning the Git repository, issue the commands:: + + $ git reset --hard HEAD + $ git pull + +otherwise download the compressed archive and uncompress it overriding the +existing installation. + +Next upgrade/add the new requirements:: + + $ pip install --upgrade -r requirements.txt + +Common steps +~~~~~~~~~~~~ + +Migrate existing database schema with:: + + $ mayan-edms.py performupgrade + +Add new static media:: + + $ mayan-edms.py collectstatic --noinput + +The upgrade procedure is now complete. + + +Backward incompatible changes +============================= + +* None + +Bugs fixed or issues closed +=========================== + +* `GitLab issue #215 <https://gitlab.com/mayan-edms/mayan-edms/issues/215>`_ Download text contents + + +.. 
_PyPI: https://pypi.python.org/pypi/mayan-edms/ diff --git a/mayan/apps/ocr/apps.py b/mayan/apps/ocr/apps.py index f584c59d18..a9bccc8fe4 100644 --- a/mayan/apps/ocr/apps.py +++ b/mayan/apps/ocr/apps.py @@ -23,7 +23,8 @@ from rest_api.classes import APIEndPoint from .handlers import initialize_new_ocr_settings, post_version_upload_ocr from .links import ( - link_document_content, link_document_ocr_erros_list, link_document_submit, + link_document_content, link_document_ocr_download, + link_document_ocr_erros_list, link_document_submit, link_document_submit_all, link_document_submit_multiple, link_document_type_ocr_settings, link_document_type_submit, link_entry_list @@ -130,9 +131,13 @@ class OCRApp(MayanAppConfig): links=(link_document_type_ocr_settings,), sources=(DocumentType,) ) menu_secondary.bind_links( - links=(link_document_content, link_document_ocr_erros_list,), + links=( + link_document_content, link_document_ocr_erros_list, + link_document_ocr_download + ), sources=( - 'ocr:document_content', 'ocr:document_ocr_error_list' + 'ocr:document_content', 'ocr:document_ocr_error_list', + 'ocr:document_ocr_download', ) ) menu_secondary.bind_links( diff --git a/mayan/apps/ocr/links.py b/mayan/apps/ocr/links.py index 124fafbc79..edbe67d0d6 100644 --- a/mayan/apps/ocr/links.py +++ b/mayan/apps/ocr/links.py @@ -41,3 +41,8 @@ link_document_ocr_erros_list = Link( permissions=(permission_ocr_content_view,), text=_('OCR errors'), view='ocr:document_ocr_error_list' ) +link_document_ocr_download = Link( + args='resolved_object.id', icon='fa fa-file-text-o', + permissions=(permission_ocr_content_view,), text=_('Download OCR text'), + view='ocr:document_ocr_download' +) diff --git a/mayan/apps/ocr/tests/test_views.py b/mayan/apps/ocr/tests/test_views.py index 4a05bc38e0..f8b258584c 100644 --- a/mayan/apps/ocr/tests/test_views.py +++ b/mayan/apps/ocr/tests/test_views.py @@ -1,10 +1,12 @@ from __future__ import unicode_literals from django.test import override_settings +from 
django.utils.encoding import force_text
 
 from documents.tests.test_views import GenericDocumentViewTestCase
 
 from ..permissions import permission_ocr_content_view
+from ..utils import get_document_ocr_content
 
 
 @override_settings(OCR_AUTO_OCR=True)
@@ -35,3 +37,26 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
         self.assertContains(
             response, 'Mayan EDMS Documentation', status_code=200
         )
+
+    def test_document_ocr_download_view_no_permission(self):
+        response = self.get(
+            'ocr:document_ocr_download', args=(self.document.pk,)
+        )
+
+        self.assertEqual(response.status_code, 403)
+
+    def test_document_ocr_download_view_with_permission(self):
+        self.expected_content_type = 'application/octet-stream; charset=utf-8'
+
+        self.grant(permission=permission_ocr_content_view)
+        response = self.get(
+            'ocr:document_ocr_download', args=(self.document.pk,)
+        )
+
+        self.assertEqual(response.status_code, 200)
+
+        self.assert_download_response(
+            response, content=(
+                ''.join(get_document_ocr_content(document=self.document))
+            ),
+        )
diff --git a/mayan/apps/ocr/urls.py b/mayan/apps/ocr/urls.py
index be1ea854f6..30f1b59359 100644
--- a/mayan/apps/ocr/urls.py
+++ b/mayan/apps/ocr/urls.py
@@ -6,9 +6,9 @@ from .api_views import (
     APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
 )
 from .views import (
-    DocumentAllSubmitView, DocumentOCRContent, DocumentOCRErrorsListView,
-    DocumentSubmitView, DocumentSubmitManyView, DocumentTypeSettingsEditView,
-    DocumentTypeSubmitView, EntryListView
+    DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
+    DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
+    DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
 )
 
 urlpatterns = [
@@ -41,6 +41,10 @@ urlpatterns = [
         r'^documents/(?P<pk>\d+)/ocr/errors/$',
         DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
     ),
+    url(
+        r'^documents/(?P<pk>\d+)/ocr/download/$',
+        DocumentOCRDownloadView.as_view(), name='document_ocr_download'
+    
),
     url(r'^all/$', EntryListView.as_view(), name='entry_list'),
 ]
 
diff --git a/mayan/apps/ocr/utils.py b/mayan/apps/ocr/utils.py
new file mode 100644
index 0000000000..809ae16b83
--- /dev/null
+++ b/mayan/apps/ocr/utils.py
@@ -0,0 +1,20 @@
+from __future__ import unicode_literals
+
+from django.utils.encoding import force_text
+from django.utils.html import conditional_escape
+
+from .models import DocumentPageContent
+
+
+def get_document_ocr_content(document):
+    """
+    Yield the HTML-escaped OCR text of each page of the document, skipping
+    pages that have no OCR content.
+    """
+    for page in document.pages.all():
+        try:
+            page_content = page.ocr_content.content
+        except DocumentPageContent.DoesNotExist:
+            pass
+        else:
+            yield conditional_escape(force_text(page_content))
diff --git a/mayan/apps/ocr/views.py b/mayan/apps/ocr/views.py
index 2f4e17e3fe..29b3ecc3d3 100644
--- a/mayan/apps/ocr/views.py
+++ b/mayan/apps/ocr/views.py
@@ -8,8 +8,8 @@ from django.utils.translation import ugettext_lazy as _
 from acls.models import AccessControlList
 
 from common.generics import (
-    ConfirmView, FormView, SingleObjectDetailView, SingleObjectEditView,
-    SingleObjectListView
+    ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
+    SingleObjectEditView, SingleObjectListView
 )
 from common.mixins import MultipleInstanceActionMixin
 from documents.models import Document, DocumentType
@@ -20,6 +20,7 @@ from .permissions import (
     permission_ocr_content_view, permission_ocr_document,
     permission_document_type_ocr_setup
 )
+from .utils import get_document_ocr_content
 
 
 class DocumentAllSubmitView(ConfirmView):
@@ -174,3 +175,19 @@ class DocumentOCRErrorsListView(SingleObjectListView):
 
     def get_queryset(self):
         return self.get_document().latest_version.ocr_errors.all()
+
+
+class DocumentOCRDownloadView(SingleObjectDownloadView):
+    """Serve the OCR text of a document as a downloadable file."""
+    model = Document
+    object_permission = permission_ocr_content_view
+
+    def get_file(self):
+        # Resolve the document once and reuse it for content and file name
+        document = self.get_object()
+        file_object = DocumentOCRDownloadView.TextIteratorIO(
+            iterator=get_document_ocr_content(document=document)
+        )
+        return DocumentOCRDownloadView.VirtualFile(
+            file=file_object, name='{}-OCR'.format(document)
+        )