OCR: Add 'ocr_content' attribute
Add the 'ocr_content' attribute to documents to allow access to a document's OCR content for indexing and other purposes. Fixes the OCR indexing failing test. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -81,6 +81,8 @@
|
||||
- Exposed a new Celery setting via the UI: CELERY_ALWAYS_EAGER.
|
||||
- Settings with lazy values are now more carefully checked
|
||||
and converted before serializing them.
|
||||
- Add the 'ocr_content' attribute to documents to allow access
|
||||
to a document's OCR content for indexing and other purposes.
|
||||
|
||||
3.1.9 (2018-11-01)
|
||||
==================
|
||||
|
||||
@@ -7,6 +7,7 @@ from kombu import Exchange, Queue
|
||||
|
||||
from django.apps import apps
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils.encoding import force_text
|
||||
from django.utils.timezone import now
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
@@ -15,7 +16,7 @@ from common import (
|
||||
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
|
||||
menu_tools
|
||||
)
|
||||
from common.classes import ModelField
|
||||
from common.classes import ModelAttribute, ModelField
|
||||
from common.settings import settings_db_sync_task_delay
|
||||
from documents.search import document_search, document_page_search
|
||||
from documents.signals import post_version_upload
|
||||
@@ -41,7 +42,7 @@ from .permissions import (
|
||||
)
|
||||
from .queues import * # NOQA
|
||||
from .signals import post_document_version_ocr
|
||||
from .utils import get_document_ocr_content
|
||||
from .utils import document_property_ocr_content, get_document_ocr_content
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -94,13 +95,19 @@ class OCRApp(MayanAppConfig):
|
||||
DocumentVersionOCRError = self.get_model('DocumentVersionOCRError')
|
||||
|
||||
Document.add_to_class('submit_for_ocr', document_ocr_submit)
|
||||
DocumentVersion.add_to_class(
|
||||
'ocr_content', get_document_ocr_content
|
||||
Document.add_to_class(
|
||||
'ocr_content', document_property_ocr_content
|
||||
)
|
||||
DocumentVersion.add_to_class(
|
||||
'submit_for_ocr', document_version_ocr_submit
|
||||
)
|
||||
|
||||
ModelAttribute(
|
||||
model=Document, name='ocr_content', description=_(
|
||||
'The OCR content of the document.'
|
||||
)
|
||||
)
|
||||
|
||||
ModelField(
|
||||
Document, name='versions__pages__ocr_content__content'
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.latest_version.ocr_content|join:" "|lower %}mayan{% endif %}'
|
||||
TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.ocr_content.lower() %}mayan{% endif %}'
|
||||
TEST_OCR_INDEX_NODE_TEMPLATE_LEVEL = 'mayan'
|
||||
|
||||
@@ -33,7 +33,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
||||
def test_submit_document_no_access(self):
|
||||
response = self._request_document_ocr_submit_view()
|
||||
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||
|
||||
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||
|
||||
def test_submit_document_with_access(self):
|
||||
@@ -42,7 +41,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
||||
)
|
||||
response = self._request_document_ocr_submit_view()
|
||||
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
||||
|
||||
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||
|
||||
def _request_document_version_ocr_submit_view(self):
|
||||
@@ -54,7 +52,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
||||
def test_submit_document_version_no_access(self):
|
||||
response = self._request_document_version_ocr_submit_view()
|
||||
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
|
||||
|
||||
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||
|
||||
def test_submit_document_version_with_access(self):
|
||||
@@ -63,7 +60,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
|
||||
)
|
||||
response = self._request_document_version_ocr_submit_view()
|
||||
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
||||
|
||||
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
|
||||
|
||||
def _request_document_page_content_view(self):
|
||||
|
||||
@@ -13,6 +13,11 @@ def get_document_ocr_content(document):
|
||||
try:
|
||||
page_content = page.ocr_content.content
|
||||
except DocumentPageOCRContent.DoesNotExist:
|
||||
pass
|
||||
yield ''
|
||||
else:
|
||||
yield force_text(page_content)
|
||||
|
||||
|
||||
@property
|
||||
def document_property_ocr_content(self):
|
||||
return ' '.join(get_document_ocr_content(self))
|
||||
|
||||
Reference in New Issue
Block a user