Add support for downloading a document's OCR text.

Closes GitLab issue #215.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-07-01 01:07:23 -04:00
parent 610e10e85a
commit 916c3497c4
8 changed files with 165 additions and 9 deletions

View File

@@ -1,3 +1,8 @@
2.5 (2017-07-XX)
================
- Add view to download a document's OCR text. GitLab issue #215.
2.4 (2017-06-23)
================
- Add Django-mathfilters.
@@ -9,7 +14,7 @@
- Make tags, metadata types and cabinets searchable via the dynamic search API. GitLab issue #344.
- Add support for updating configuration options from environment variables.
- Add purgelocks management command. GitLab issue #221.
- Fix index rebuilding for multi value first levels. GitLab issue #391.
- Fix index rebuilding for multi value first levels. GitLab issue #391.
- Truncate views titles via the APPEARANCE_MAXIMUM_TITLE_LENGTH setting. GitLab issue #217.
- Add background task manager app. GitLab issue #132.
- Add link to show a document's OCR errors. GitLab issue #291.

84
docs/releases/2.5.rst Normal file
View File

@@ -0,0 +1,84 @@
=============================
Mayan EDMS v2.5 release notes
=============================
Released: July XX, 2017
What's new
==========
Other Changes
-------------
- Add view to download a document's OCR text. GitLab issue #215.
- Add missing OCR migration.
- Improve error output of the performupgrade command to debug upgrade errors
that could stop an upgrade (missing document files, etc).
- Enable the django-mathfilters app added in version 2.4.
- Do a complete pull and synchronization of the translations to fix missing
translations for Polish. Thanks to Wojtek Warczakowski for the report.
- Allow null for the SANE source resolution field. Even though the field was
marked as allowing blank values it was failing because it is a number field
and number fields need to allow explicit null values when left blank.
- Rename the mayan_task_manager app to task_manager.
- Make the task manager translatable.
- Add Turkish to the list of processed languages.
Removals
--------
* None
Upgrading from a previous version
---------------------------------
Using PIP
~~~~~~~~~
Type in the console::
$ pip install -U mayan-edms
the requirements will also be updated automatically.
Using Git
~~~~~~~~~
If you installed Mayan EDMS by cloning the Git repository issue the commands::
$ git reset --hard HEAD
$ git pull
otherwise download the compressed archive and uncompress it, overwriting the
existing installation.
Next upgrade/add the new requirements::
$ pip install --upgrade -r requirements.txt
Common steps
~~~~~~~~~~~~
Migrate existing database schema with::
$ mayan-edms.py performupgrade
Add new static media::
$ mayan-edms.py collectstatic --noinput
The upgrade procedure is now complete.
Backward incompatible changes
=============================
* None
Bugs fixed or issues closed
===========================
* `GitLab issue #215 <https://gitlab.com/mayan-edms/mayan-edms/issues/215>`_ Download text contents
.. _PyPI: https://pypi.python.org/pypi/mayan-edms/

View File

@@ -23,7 +23,8 @@ from rest_api.classes import APIEndPoint
from .handlers import initialize_new_ocr_settings, post_version_upload_ocr
from .links import (
link_document_content, link_document_ocr_erros_list, link_document_submit,
link_document_content, link_document_ocr_download,
link_document_ocr_erros_list, link_document_submit,
link_document_submit_all, link_document_submit_multiple,
link_document_type_ocr_settings, link_document_type_submit,
link_entry_list
@@ -130,9 +131,13 @@ class OCRApp(MayanAppConfig):
links=(link_document_type_ocr_settings,), sources=(DocumentType,)
)
menu_secondary.bind_links(
links=(link_document_content, link_document_ocr_erros_list,),
links=(
link_document_content, link_document_ocr_erros_list,
link_document_ocr_download
),
sources=(
'ocr:document_content', 'ocr:document_ocr_error_list'
'ocr:document_content', 'ocr:document_ocr_error_list',
'ocr:document_ocr_download',
)
)
menu_secondary.bind_links(

View File

@@ -41,3 +41,8 @@ link_document_ocr_erros_list = Link(
permissions=(permission_ocr_content_view,), text=_('OCR errors'),
view='ocr:document_ocr_error_list'
)
# Link labelled 'Download OCR text' that points at the
# 'ocr:document_ocr_download' view and lists the OCR content view permission.
# NOTE(review): 'resolved_object.id' is presumably resolved against the
# template context to supply the document id for the URL -- confirm in Link.
link_document_ocr_download = Link(
args='resolved_object.id', icon='fa fa-file-text-o',
permissions=(permission_ocr_content_view,), text=_('Download OCR text'),
view='ocr:document_ocr_download'
)

View File

@@ -1,10 +1,12 @@
from __future__ import unicode_literals
from django.test import override_settings
from django.utils.encoding import force_text
from documents.tests.test_views import GenericDocumentViewTestCase
from ..permissions import permission_ocr_content_view
from ..utils import get_document_ocr_content
@override_settings(OCR_AUTO_OCR=True)
@@ -35,3 +37,26 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
self.assertContains(
response, 'Mayan EDMS Documentation', status_code=200
)
def test_document_ocr_download_view_no_permission(self):
    # No permission is granted here, so the download request is expected
    # to be rejected with a 403 status code.
    url_arguments = (self.document.pk,)
    response = self.get('ocr:document_ocr_download', args=url_arguments)

    self.assertEqual(response.status_code, 403)
def test_document_download_view_with_permission(self):
    # Content type that assert_download_response is expected to see.
    self.expected_content_type = 'application/octet-stream; charset=utf-8'
    self.grant(permission=permission_ocr_content_view)

    url_arguments = (self.document.pk,)
    response = self.get('ocr:document_ocr_download', args=url_arguments)
    self.assertEqual(response.status_code, 200)

    # The download must match the chunks yielded by the OCR content helper,
    # concatenated with no separator.
    expected_text = ''.join(get_document_ocr_content(document=self.document))
    self.assert_download_response(response, content=expected_text)

View File

@@ -6,9 +6,9 @@ from .api_views import (
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
)
from .views import (
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRErrorsListView,
DocumentSubmitView, DocumentSubmitManyView, DocumentTypeSettingsEditView,
DocumentTypeSubmitView, EntryListView
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
)
urlpatterns = [
@@ -41,6 +41,10 @@ urlpatterns = [
r'^documents/(?P<pk>\d+)/ocr/errors/$',
DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
),
url(
r'^documents/(?P<pk>\d+)/ocr/download/$',
DocumentOCRDownloadView.as_view(), name='document_ocr_download'
),
url(r'^all/$', EntryListView.as_view(), name='entry_list'),
]

14
mayan/apps/ocr/utils.py Normal file
View File

@@ -0,0 +1,14 @@
from __future__ import unicode_literals

from django.core.exceptions import ObjectDoesNotExist
from django.utils.encoding import force_unicode
from django.utils.html import conditional_escape
def get_document_ocr_content(document):
    """
    Yield the OCR text of a document, one chunk per page.

    Pages are visited in the order returned by ``document.pages.all()``.
    A page without an OCR content entry is skipped. Each yielded chunk is
    the page text coerced with ``force_unicode`` and passed through
    ``conditional_escape``.
    """
    for page in document.pages.all():
        try:
            page_content = page.ocr_content.content
        except ObjectDoesNotExist:
            # Every model's DoesNotExist derives from ObjectDoesNotExist, so
            # a page with no OCR content is caught here without needing the
            # OCR content model class in this module's namespace.
            continue

        yield conditional_escape(force_unicode(page_content))

View File

@@ -8,8 +8,8 @@ from django.utils.translation import ugettext_lazy as _
from acls.models import AccessControlList
from common.generics import (
ConfirmView, FormView, SingleObjectDetailView, SingleObjectEditView,
SingleObjectListView
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
SingleObjectEditView, SingleObjectListView
)
from common.mixins import MultipleInstanceActionMixin
from documents.models import Document, DocumentType
@@ -20,6 +20,7 @@ from .permissions import (
permission_ocr_content_view, permission_ocr_document,
permission_document_type_ocr_setup
)
from .utils import get_document_ocr_content
class DocumentAllSubmitView(ConfirmView):
@@ -174,3 +175,16 @@ class DocumentOCRErrorsListView(SingleObjectListView):
def get_queryset(self):
return self.get_document().latest_version.ocr_errors.all()
class DocumentOCRDownloadView(SingleObjectDownloadView):
    """
    Download view that serves the OCR text of a single document.

    ``object_permission`` is set to the OCR content view permission.
    NOTE(review): enforcement of that permission is presumably handled by
    the base download view -- confirm in common.generics.
    """
    model = Document
    object_permission = permission_ocr_content_view

    def get_file(self):
        """
        Return a virtual file wrapping the document's OCR text.

        The content comes from the ``get_document_ocr_content`` iterator and
        the file name is the document's string form followed by ``-OCR``.
        """
        # Resolve the document once and reuse it, instead of repeating
        # whatever lookup get_object() performs for the content and the name.
        document = self.get_object()

        file_object = DocumentOCRDownloadView.TextIteratorIO(
            iterator=get_document_ocr_content(document=document)
        )
        return DocumentOCRDownloadView.VirtualFile(
            file=file_object, name='{}-OCR'.format(document)
        )