diff --git a/HISTORY.rst b/HISTORY.rst index ceb595e2c4..abfaa3fc24 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -81,6 +81,8 @@ - Exposed a new Celery setting via the UI: CELERY_ALWAYS_EAGER. - Settings with lazy values are now more carefully checked and converteed before serializing them. +- Add the 'ocr_content' attribute to documents to allow access + to a document's OCR content for indexing and other purposes. 3.1.9 (2018-11-01) ================== diff --git a/mayan/apps/ocr/apps.py b/mayan/apps/ocr/apps.py index e1d21110b1..c1312ae829 100644 --- a/mayan/apps/ocr/apps.py +++ b/mayan/apps/ocr/apps.py @@ -7,6 +7,7 @@ from kombu import Exchange, Queue from django.apps import apps from django.db.models.signals import post_save +from django.utils.encoding import force_text from django.utils.timezone import now from django.utils.translation import ugettext_lazy as _ @@ -15,7 +16,7 @@ from common import ( MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary, menu_tools ) -from common.classes import ModelField +from common.classes import ModelAttribute, ModelField from common.settings import settings_db_sync_task_delay from documents.search import document_search, document_page_search from documents.signals import post_version_upload @@ -41,7 +42,7 @@ from .permissions import ( ) from .queues import * # NOQA from .signals import post_document_version_ocr -from .utils import get_document_ocr_content +from .utils import document_property_ocr_content, get_document_ocr_content logger = logging.getLogger(__name__) @@ -94,13 +95,19 @@ class OCRApp(MayanAppConfig): DocumentVersionOCRError = self.get_model('DocumentVersionOCRError') Document.add_to_class('submit_for_ocr', document_ocr_submit) - DocumentVersion.add_to_class( - 'ocr_content', get_document_ocr_content + Document.add_to_class( + 'ocr_content', document_property_ocr_content ) DocumentVersion.add_to_class( 'submit_for_ocr', document_version_ocr_submit ) + ModelAttribute( + model=Document, 
name='ocr_content', description=_( + 'The OCR content of the document.' + ) + ) + ModelField( Document, name='versions__pages__ocr_content__content' ) diff --git a/mayan/apps/ocr/tests/literals.py b/mayan/apps/ocr/tests/literals.py index 6837767bd3..155288f064 100644 --- a/mayan/apps/ocr/tests/literals.py +++ b/mayan/apps/ocr/tests/literals.py @@ -1,4 +1,4 @@ from __future__ import unicode_literals -TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.latest_version.ocr_content|join:" "|lower %}mayan{% endif %}' +TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.ocr_content|lower %}mayan{% endif %}' TEST_OCR_INDEX_NODE_TEMPLATE_LEVEL = 'mayan' diff --git a/mayan/apps/ocr/tests/test_api.py b/mayan/apps/ocr/tests/test_api.py index c3aadceac3..5c3c0bd286 100644 --- a/mayan/apps/ocr/tests/test_api.py +++ b/mayan/apps/ocr/tests/test_api.py @@ -33,7 +33,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase): def test_submit_document_no_access(self): response = self._request_document_ocr_submit_view() self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content')) def test_submit_document_with_access(self): @@ -42,7 +41,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase): ) response = self._request_document_ocr_submit_view() self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) - self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content')) def _request_document_version_ocr_submit_view(self): @@ -54,7 +52,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase): def test_submit_document_version_no_access(self): response = self._request_document_version_ocr_submit_view() self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN) - self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content')) def test_submit_document_version_with_access(self): @@ -63,7 +60,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase): ) response = 
self._request_document_version_ocr_submit_view() self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED) - self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content')) def _request_document_page_content_view(self): diff --git a/mayan/apps/ocr/utils.py b/mayan/apps/ocr/utils.py index 0ac5de5fb3..304e497da1 100644 --- a/mayan/apps/ocr/utils.py +++ b/mayan/apps/ocr/utils.py @@ -13,6 +13,11 @@ def get_document_ocr_content(document): try: page_content = page.ocr_content.content except DocumentPageOCRContent.DoesNotExist: - pass + yield '' else: yield force_text(page_content) + + +@property +def document_property_ocr_content(self): + return ' '.join(get_document_ocr_content(self))