OCR: Add 'ocr_content' attribute

Add the 'ocr_content' attribute to documents to allow access
to a document's OCR content for indexing and other purposes.

Fixes the failing OCR indexing test.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2018-11-27 05:20:31 -04:00
parent 0f5625a356
commit aaf9f7a8be
5 changed files with 20 additions and 10 deletions

View File

@@ -81,6 +81,8 @@
- Exposed a new Celery setting via the UI: CELERY_ALWAYS_EAGER.
- Settings with lazy values are now more carefully checked
and converted before serializing them.
- Add the 'ocr_content' attribute to documents to allow access
to a document's OCR content for indexing and other purposes.
3.1.9 (2018-11-01)
==================

View File

@@ -7,6 +7,7 @@ from kombu import Exchange, Queue
from django.apps import apps
from django.db.models.signals import post_save
from django.utils.encoding import force_text
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
@@ -15,7 +16,7 @@ from common import (
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
menu_tools
)
from common.classes import ModelField
from common.classes import ModelAttribute, ModelField
from common.settings import settings_db_sync_task_delay
from documents.search import document_search, document_page_search
from documents.signals import post_version_upload
@@ -41,7 +42,7 @@ from .permissions import (
)
from .queues import * # NOQA
from .signals import post_document_version_ocr
from .utils import get_document_ocr_content
from .utils import document_property_ocr_content, get_document_ocr_content
logger = logging.getLogger(__name__)
@@ -94,13 +95,19 @@ class OCRApp(MayanAppConfig):
DocumentVersionOCRError = self.get_model('DocumentVersionOCRError')
Document.add_to_class('submit_for_ocr', document_ocr_submit)
DocumentVersion.add_to_class(
'ocr_content', get_document_ocr_content
Document.add_to_class(
'ocr_content', document_property_ocr_content
)
DocumentVersion.add_to_class(
'submit_for_ocr', document_version_ocr_submit
)
ModelAttribute(
model=Document, name='ocr_content', description=_(
'The OCR content of the document.'
)
)
ModelField(
Document, name='versions__pages__ocr_content__content'
)

View File

@@ -1,4 +1,4 @@
from __future__ import unicode_literals
TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.latest_version.ocr_content|join:" "|lower %}mayan{% endif %}'
TEST_OCR_INDEX_NODE_TEMPLATE = '{% if "mayan" in document.ocr_content.lower() %}mayan{% endif %}'
TEST_OCR_INDEX_NODE_TEMPLATE_LEVEL = 'mayan'

View File

@@ -33,7 +33,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
def test_submit_document_no_access(self):
response = self._request_document_ocr_submit_view()
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
def test_submit_document_with_access(self):
@@ -42,7 +41,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
)
response = self._request_document_ocr_submit_view()
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
def _request_document_version_ocr_submit_view(self):
@@ -54,7 +52,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
def test_submit_document_version_no_access(self):
response = self._request_document_version_ocr_submit_view()
self.assertEqual(response.status_code, status.HTTP_403_FORBIDDEN)
self.assertFalse(hasattr(self.document.pages.first(), 'ocr_content'))
def test_submit_document_version_with_access(self):
@@ -63,7 +60,6 @@ class OCRAPITestCase(DocumentTestMixin, BaseAPITestCase):
)
response = self._request_document_version_ocr_submit_view()
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
self.assertTrue(hasattr(self.document.pages.first(), 'ocr_content'))
def _request_document_page_content_view(self):

View File

@@ -13,6 +13,11 @@ def get_document_ocr_content(document):
try:
page_content = page.ocr_content.content
except DocumentPageOCRContent.DoesNotExist:
pass
yield ''
else:
yield force_text(page_content)
@property
def document_property_ocr_content(self):
    """Return the document's OCR content as a single string.

    The text fragments yielded by ``get_document_ocr_content`` for this
    object are concatenated with a single space between each fragment.
    """
    page_texts = get_document_ocr_content(self)
    return ' '.join(page_texts)