OCR: Add 'ocr_content' attribute
Add the 'ocr_content' attribute to documents to allow access to a document's OCR content for indexing and other purposes. Fixes the OCR indexing failing test. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -81,6 +81,8 @@
|
|||||||
- Exposed a new Celery setting via the UI: CELERY_ALWAYS_EAGER.
|
- Exposed a new Celery setting via the UI: CELERY_ALWAYS_EAGER.
|
||||||
- Settings with lazy values are now more carefully checked
|
- Settings with lazy values are now more carefully checked
|
||||||
and converted before serializing them.
|
and converted before serializing them.
|
||||||
|
- Add the 'ocr_content' attribute to documents to allow access
|
||||||
|
to a document's OCR content for indexing and other purposes.
|
||||||
|
|
||||||
3.1.9 (2018-11-01)
|
3.1.9 (2018-11-01)
|
||||||
==================
|
==================
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ from kombu import Exchange, Queue
|
|||||||
|
|
||||||
from django.apps import apps
|
from django.apps import apps
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
|
from django.utils.encoding import force_text
|
||||||
from django.utils.timezone import now
|
from django.utils.timezone import now
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
@@ -15,7 +16,7 @@ from common import (
|
|||||||
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
|
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
|
||||||
menu_tools
|
menu_tools
|
||||||
)
|
)
|
||||||
from common.classes import ModelField
|
from common.classes import ModelAttribute, ModelField
|
||||||
from common.settings import settings_db_sync_task_delay
|
from common.settings import settings_db_sync_task_delay
|
||||||
from documents.search import document_search, document_page_search
|
from documents.search import document_search, document_page_search
|
||||||
from documents.signals import post_version_upload
|
from documents.signals import post_version_upload
|
||||||
@@ -41,7 +42,7 @@ from .permissions import (
|
|||||||
)
|
)
|
||||||
from .queues import * # NOQA
|
from .queues import * # NOQA
|
||||||
from .signals import post_document_version_ocr
|
from .signals import post_document_version_ocr
|
||||||
from .utils import get_document_ocr_content
|
from .utils import document_property_ocr_content, get_document_ocr_content
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -94,13 +95,19 @@ class OCRApp(MayanAppConfig):
|
|||||||
DocumentVersionOCRError = self.get_model('DocumentVersionOCRError')
|
DocumentVersionOCRError = self.get_model('DocumentVersionOCRError')
|
||||||
|
|
||||||
Document.add_to_class('submit_for_ocr', document_ocr_submit)
|
Document.add_to_class('submit_for_ocr', document_ocr_submit)
|
||||||
DocumentVersion.add_to_class(
|
Document.add_to_class(
|
||||||
'ocr_content', get_document_ocr_content
|
'ocr_content', document_property_ocr_content
|
||||||
)
|
)
|
||||||
DocumentVersion.add_to_class(
|
DocumentVersion.add_to_class(
|
||||||
'submit_for_ocr', document_version_ocr_submit
|
'submit_for_ocr', document_version_ocr_submit
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ModelAttribute(
|
||||||
|
model=Document, name='ocr_content', description=_(
|
||||||
|
'The OCR content of the document.'
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
ModelField(
|
ModelField(
|
||||||
Document, name='versions__pages__ocr_content__content'
|
Document, name='versions__pages__ocr_content__content'
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.latest_version.ocr_content|join:" "|lower %}mayan{% endif %}'
|
TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.ocr_content.lower() %}mayan{% endif %}'
|
||||||
TEST_OCR_INDEX_NODE_TEMPLATE_LEVEL = 'mayan'
|
TEST_OCR_INDEX_NODE_TEMPLATE_LEVEL = 'mayan'
|
||||||
|
|||||||
@@ -33,7 +33,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
|||||||
def test_submit_document_no_access(self):
|
def test_submit_document_no_access(self):
|
||||||
response = self._request_document_ocr_submit_view()
|
response = self._request_document_ocr_submit_view()
|
||||||
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
|
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||||
|
|
||||||
def test_submit_document_with_access(self):
|
def test_submit_document_with_access(self):
|
||||||
@@ -42,7 +41,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
|||||||
)
|
)
|
||||||
response = self._request_document_ocr_submit_view()
|
response = self._request_document_ocr_submit_view()
|
||||||
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
||||||
|
|
||||||
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
|
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||||
|
|
||||||
def _request_document_version_ocr_submit_view(self):
|
def _request_document_version_ocr_submit_view(self):
|
||||||
@@ -54,7 +52,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
|||||||
def test_submit_document_version_no_access(self):
|
def test_submit_document_version_no_access(self):
|
||||||
response = self._request_document_version_ocr_submit_view()
|
response = self._request_document_version_ocr_submit_view()
|
||||||
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||||
|
|
||||||
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
|
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||||
|
|
||||||
def test_submit_document_version_with_access(self):
|
def test_submit_document_version_with_access(self):
|
||||||
@@ -63,7 +60,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
|||||||
)
|
)
|
||||||
response = self._request_document_version_ocr_submit_view()
|
response = self._request_document_version_ocr_submit_view()
|
||||||
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
||||||
|
|
||||||
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
|
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||||
|
|
||||||
def _request_document_page_content_view(self):
|
def _request_document_page_content_view(self):
|
||||||
|
|||||||
@@ -13,6 +13,11 @@ def get_document_ocr_content(document):
|
|||||||
try:
|
try:
|
||||||
page_content = page.ocr_content.content
|
page_content = page.ocr_content.content
|
||||||
except DocumentPageOCRContent.DoesNotExist:
|
except DocumentPageOCRContent.DoesNotExist:
|
||||||
pass
|
yield ''
|
||||||
else:
|
else:
|
||||||
yield force_text(page_content)
|
yield force_text(page_content)
|
||||||
|
|
||||||
|
|
||||||
|
@property
|
||||||
|
def document_property_ocr_content(self):
|
||||||
|
return ' '.join(get_document_ocr_content(self))
|
||||||
|
|||||||
Reference in New Issue
Block a user