diff --git a/mayan/apps/document_parsing/apps.py b/mayan/apps/document_parsing/apps.py index 7def216213..504e5efc3d 100644 --- a/mayan/apps/document_parsing/apps.py +++ b/mayan/apps/document_parsing/apps.py @@ -45,7 +45,7 @@ from .permissions import ( permission_parse_document ) from .signals import post_document_version_parsing -from .utils import get_document_content +from .utils import get_document_content, get_document_version_content logger = logging.getLogger(__name__) @@ -65,7 +65,7 @@ class DocumentParsingApp(MayanAppConfig): app_label='documents', model_name='Document' ) DocumentPage = apps.get_model( - app_label='documents', model_name='DocumentVersionPage' + app_label='documents', model_name='DocumentPage' ) DocumentType = apps.get_model( app_label='documents', model_name='DocumentType' @@ -76,6 +76,9 @@ class DocumentParsingApp(MayanAppConfig): DocumentVersion = apps.get_model( app_label='documents', model_name='DocumentVersion' ) + DocumentVersionPage = apps.get_model( + app_label='documents', model_name='DocumentVersionPage' + ) DocumentVersionParseError = self.get_model( model_name='DocumentVersionParseError' ) @@ -87,7 +90,7 @@ class DocumentParsingApp(MayanAppConfig): name='content', value=get_document_content ) DocumentVersion.add_to_class( - name='content', value=get_document_content + name='content', value=get_document_version_content ) DocumentVersion.add_to_class( name='submit_for_parsing', diff --git a/mayan/apps/document_parsing/forms.py b/mayan/apps/document_parsing/forms.py index 12778c6acb..2803c9cd59 100644 --- a/mayan/apps/document_parsing/forms.py +++ b/mayan/apps/document_parsing/forms.py @@ -26,9 +26,9 @@ class DocumentContentForm(forms.Form): except AttributeError: document_pages = [] - for page in document_pages: + for document_page in document_pages: try: - page_content = page.content.content + page_content = document_page.content_object.content.content except DocumentVersionPageContent.DoesNotExist: pass else: @@ -37,7 +37,7 @@ class DocumentContentForm(forms.Form): '\n\n\n
- %s -

\n\n\n' % ( ugettext( 'Page %(page_number)d' - ) % {'page_number': page.page_number} + ) % {'page_number': document_page.page_number} ) ) @@ -72,7 +72,7 @@ class DocumentPageContentForm(forms.Form): self.fields['contents'].initial = '' try: - page_content = document_page.content.content + page_content = document_page.content_object.content.content except DocumentVersionPageContent.DoesNotExist: pass else: diff --git a/mayan/apps/document_parsing/managers.py b/mayan/apps/document_parsing/managers.py index 1669f89e70..7748596f2e 100644 --- a/mayan/apps/document_parsing/managers.py +++ b/mayan/apps/document_parsing/managers.py @@ -22,7 +22,9 @@ class DocumentPageContentManager(models.Manager): def delete_content_for(self, document, user=None): with transaction.atomic(): for document_page in document.pages.all(): - self.filter(document_page=document_page).delete() + self.filter( + document_version_page=document_page.content_object + ).delete() event_parsing_document_content_deleted.commit( actor=user, target=document diff --git a/mayan/apps/document_parsing/models.py b/mayan/apps/document_parsing/models.py index 9c7483f448..05811b63c1 100644 --- a/mayan/apps/document_parsing/models.py +++ b/mayan/apps/document_parsing/models.py @@ -11,32 +11,6 @@ from mayan.apps.documents.models import ( from .managers import DocumentPageContentManager, DocumentTypeSettingsManager -@python_2_unicode_compatible -class DocumentVersionPageContent(models.Model): - """ - This model store's the parsed content of a document page. - """ - document_version_page = models.OneToOneField( - on_delete=models.CASCADE, related_name='content', - to=DocumentVersionPage, verbose_name=_('Document version page') - ) - content = models.TextField( - blank=True, help_text=_( - 'The actual text content as extracted by the document ' - 'parsing backend.' - ), verbose_name=_('Content') - ) - - objects = DocumentPageContentManager() - - class Meta: - verbose_name = _('Document version page content') - verbose_name_plural = _('Document version pages contents') - - def __str__(self): - return force_text(self.document_page) - - class DocumentTypeSettings(models.Model): """ This model stores the parsing settings for a document type. @@ -62,6 +36,32 @@ class DocumentTypeSettings(models.Model): verbose_name_plural = _('Document types settings') +@python_2_unicode_compatible +class DocumentVersionPageContent(models.Model): + """ + This model store's the parsed content of a document page. + """ + document_version_page = models.OneToOneField( + on_delete=models.CASCADE, related_name='content', + to=DocumentVersionPage, verbose_name=_('Document version page') + ) + content = models.TextField( + blank=True, help_text=_( + 'The actual text content as extracted by the document ' + 'parsing backend.' + ), verbose_name=_('Content') + ) + + objects = DocumentPageContentManager() + + class Meta: + verbose_name = _('Document version page content') + verbose_name_plural = _('Document version pages contents') + + def __str__(self): + return force_text(self.document_page) + + @python_2_unicode_compatible class DocumentVersionParseError(models.Model): """ diff --git a/mayan/apps/document_parsing/parsers.py b/mayan/apps/document_parsing/parsers.py index 8ad24b4115..469b974377 100644 --- a/mayan/apps/document_parsing/parsers.py +++ b/mayan/apps/document_parsing/parsers.py @@ -23,11 +23,13 @@ class Parser(object): _registry = {} @classmethod - def parse_document_page(cls, document_page): - for parser_class in cls._registry.get(document_page.document_version.mimetype, ()): + def parse_document_version_page(cls, document_version_page): + for parser_class in cls._registry.get(document_version_page.document_version.mimetype, ()): try: parser = parser_class() - parser.process_document_page(document_page) + parser.process_document_page( + document_version_page=document_version_page + ) except ParserError: # If parser raises error, try next parser in the list pass @@ -41,7 +43,9 @@ class Parser(object): for parser_class in cls._registry.get(document_version.mimetype, ()): try: parser = parser_class() - parser.process_document_version(document_version) + parser.process_document_version( + document_version=document_version + ) except ParserError: # If parser raises error, try next parser in the list pass @@ -64,10 +68,12 @@ class Parser(object): ) logger.debug('document version: %d', document_version.pk) - for document_page in document_version.pages.all(): - self.process_document_page(document_page=document_page) + for document_version_page in document_version.pages.all(): + self.process_document_version_page( + document_version_page=document_version_page + ) - def process_document_page(self, document_page): + def process_document_version_page(self, document_version_page): DocumentVersionPageContent = apps.get_model( app_label='document_parsing', model_name='DocumentVersionPageContent' @@ -75,19 +81,20 @@ class Parser(object): logger.info( 'Processing page: %d of document version: %s', - document_page.page_number, document_page.document_version + document_version_page.page_number, + document_version_page.document_version ) - file_object = document_page.document_version.get_intermediate_file() + file_object = document_version_page.document_version.get_intermediate_file() try: - document_page_content, created = DocumentVersionPageContent.objects.get_or_create( - document_page=document_page + document_version_page_content, created = DocumentVersionPageContent.objects.get_or_create( + document_version_page=document_version_page ) - document_page_content.content = self.execute( - file_object=file_object, page_number=document_page.page_number + document_version_page_content.content = self.execute( + file_object=file_object, page_number=document_version_page.page_number ) - document_page_content.save() + document_version_page_content.save() except Exception as exception: error_message = _('Exception parsing page; %s') % exception logger.error(error_message) @@ -97,7 +104,8 @@ class Parser(object): logger.info( 'Finished processing page: %d of document version: %s', - document_page.page_number, document_page.document_version + document_version_page.page_number, + document_version_page.document_version ) def execute(self, file_object, page_number): diff --git a/mayan/apps/document_parsing/tasks.py b/mayan/apps/document_parsing/tasks.py index 4debffbc60..653552f741 100644 --- a/mayan/apps/document_parsing/tasks.py +++ b/mayan/apps/document_parsing/tasks.py @@ -14,8 +14,8 @@ def task_parse_document_version(document_version_pk): DocumentVersion = apps.get_model( app_label='documents', model_name='DocumentVersion' ) - DocumentPageContent = apps.get_model( - app_label='document_parsing', model_name='DocumentPageContent' + DocumentVersionPageContent = apps.get_model( + app_label='document_parsing', model_name='DocumentVersionPageContent' ) document_version = DocumentVersion.objects.get( @@ -24,6 +24,6 @@ def task_parse_document_version(document_version_pk): logger.info( 'Starting parsing for document version: %s', document_version ) - DocumentPageContent.objects.process_document_version( + DocumentVersionPageContent.objects.process_document_version( document_version=document_version ) diff --git a/mayan/apps/document_parsing/tests/test_events.py b/mayan/apps/document_parsing/tests/test_events.py index 1a2860305d..17bbe198fb 100644 --- a/mayan/apps/document_parsing/tests/test_events.py +++ b/mayan/apps/document_parsing/tests/test_events.py @@ -10,7 +10,7 @@ from ..events import ( event_parsing_document_version_submit, event_parsing_document_version_finish ) -from ..models import DocumentPageContent +from ..models import DocumentVersionPageContent class DocumentParsingEventsTestCase(GenericDocumentTestCase): @@ -19,7 +19,7 @@ class DocumentParsingEventsTestCase(GenericDocumentTestCase): def test_document_content_deleted_event(self): Action.objects.all().delete() - DocumentPageContent.objects.delete_content_for( + DocumentVersionPageContent.objects.delete_content_for( document=self.test_document ) diff --git a/mayan/apps/document_parsing/tests/test_parsers.py b/mayan/apps/document_parsing/tests/test_parsers.py index 237bccc567..2eeb8703a6 100644 --- a/mayan/apps/document_parsing/tests/test_parsers.py +++ b/mayan/apps/document_parsing/tests/test_parsers.py @@ -18,5 +18,5 @@ class ParserTestCase(DocumentTestMixin, BaseTestCase): parser.process_document_version(self.test_document.latest_version) self.assertTrue( - TEST_DOCUMENT_CONTENT in self.test_document.pages.first().content.content + TEST_DOCUMENT_CONTENT in self.test_document.pages.first().content_object.content.content ) diff --git a/mayan/apps/document_parsing/tests/test_views.py b/mayan/apps/document_parsing/tests/test_views.py index bb88c1817b..21ca0ef808 100644 --- a/mayan/apps/document_parsing/tests/test_views.py +++ b/mayan/apps/document_parsing/tests/test_views.py @@ -5,7 +5,7 @@ from django.test import override_settings from mayan.apps.documents.tests.base import GenericDocumentViewTestCase from mayan.apps.documents.tests.literals import TEST_HYBRID_DOCUMENT -from ..models import DocumentPageContent +from ..models import DocumentVersionPageContent from ..permissions import ( permission_content_view, permission_document_type_parsing_setup, permission_parse_document @@ -72,8 +72,8 @@ class DocumentContentViewsTestCase( self.assertEqual(response.status_code, 404) self.assertTrue( - DocumentPageContent.objects.filter( - document_page=self.test_document.pages.first() + DocumentVersionPageContent.objects.filter( + document_version_page=self.test_document.pages.first().content_object ).exists() ) @@ -86,8 +86,8 @@ class DocumentContentViewsTestCase( self.assertEqual(response.status_code, 302) self.assertFalse( - DocumentPageContent.objects.filter( - document_page=self.test_document.pages.first() + DocumentVersionPageContent.objects.filter( + document_version_page=self.test_document.pages.first().content_object ).exists() ) diff --git a/mayan/apps/document_parsing/utils.py b/mayan/apps/document_parsing/utils.py index ab8e049450..5086d5cf02 100644 --- a/mayan/apps/document_parsing/utils.py +++ b/mayan/apps/document_parsing/utils.py @@ -6,14 +6,28 @@ from django.utils.html import conditional_escape def get_document_content(document): - DocumentPageContent = apps.get_model( - app_label='document_parsing', model_name='DocumentPageContent' + DocumentVersionPageContent = apps.get_model( + app_label='document_parsing', model_name='DocumentVersionPageContent' ) - for page in document.pages.all(): + for document_page in document.pages.all(): try: - page_content = page.content.content - except DocumentPageContent.DoesNotExist: + page_content = document_page.content_object.content.content + except DocumentVersionPageContent.DoesNotExist: + pass + else: + yield conditional_escape(force_text(page_content)) + + +def get_document_version_content(document_version): + DocumentVersionPageContent = apps.get_model( + app_label='document_parsing', model_name='DocumentVersionPageContent' + ) + + for document_version_page in document_version.pages.all(): + try: + page_content = document_version_page.content.content + except DocumentVersionPageContent.DoesNotExist: pass else: yield conditional_escape(force_text(page_content)) diff --git a/mayan/apps/documents/apps.py b/mayan/apps/documents/apps.py index 4faae2f8bd..541134fb4b 100644 --- a/mayan/apps/documents/apps.py +++ b/mayan/apps/documents/apps.py @@ -235,7 +235,7 @@ class DocumentsApp(MayanAppConfig): model=DocumentPage, manager_name='passthrough' ) ModelPermission.register_inheritance( - model=DocumentPageResult, related='document_version__document', + model=DocumentPageResult, related='document', ) ModelPermission.register_manager( model=DocumentPageResult, manager_name='passthrough' diff --git a/mayan/apps/ocr/apps.py b/mayan/apps/ocr/apps.py index 1d0410bd44..772ec287fa 100644 --- a/mayan/apps/ocr/apps.py +++ b/mayan/apps/ocr/apps.py @@ -66,7 +66,7 @@ class OCRApp(MayanAppConfig): app_label='documents', model_name='Document' ) DocumentPage = apps.get_model( - app_label='documents', model_name='DocumentVersionPage' + app_label='documents', model_name='DocumentPage' ) DocumentType = apps.get_model( app_label='documents', model_name='DocumentType'