Fix parsing tests
Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
This commit is contained in:
@@ -45,7 +45,7 @@ from .permissions import (
|
|||||||
permission_parse_document
|
permission_parse_document
|
||||||
)
|
)
|
||||||
from .signals import post_document_version_parsing
|
from .signals import post_document_version_parsing
|
||||||
from .utils import get_document_content
|
from .utils import get_document_content, get_document_version_content
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
@@ -65,7 +65,7 @@ class DocumentParsingApp(MayanAppConfig):
|
|||||||
app_label='documents', model_name='Document'
|
app_label='documents', model_name='Document'
|
||||||
)
|
)
|
||||||
DocumentPage = apps.get_model(
|
DocumentPage = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentVersionPage'
|
app_label='documents', model_name='DocumentPage'
|
||||||
)
|
)
|
||||||
DocumentType = apps.get_model(
|
DocumentType = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentType'
|
app_label='documents', model_name='DocumentType'
|
||||||
@@ -76,6 +76,9 @@ class DocumentParsingApp(MayanAppConfig):
|
|||||||
DocumentVersion = apps.get_model(
|
DocumentVersion = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentVersion'
|
app_label='documents', model_name='DocumentVersion'
|
||||||
)
|
)
|
||||||
|
DocumentVersionPage = apps.get_model(
|
||||||
|
app_label='documents', model_name='DocumentVersionPage'
|
||||||
|
)
|
||||||
DocumentVersionParseError = self.get_model(
|
DocumentVersionParseError = self.get_model(
|
||||||
model_name='DocumentVersionParseError'
|
model_name='DocumentVersionParseError'
|
||||||
)
|
)
|
||||||
@@ -87,7 +90,7 @@ class DocumentParsingApp(MayanAppConfig):
|
|||||||
name='content', value=get_document_content
|
name='content', value=get_document_content
|
||||||
)
|
)
|
||||||
DocumentVersion.add_to_class(
|
DocumentVersion.add_to_class(
|
||||||
name='content', value=get_document_content
|
name='content', value=get_document_version_content
|
||||||
)
|
)
|
||||||
DocumentVersion.add_to_class(
|
DocumentVersion.add_to_class(
|
||||||
name='submit_for_parsing',
|
name='submit_for_parsing',
|
||||||
|
|||||||
@@ -26,9 +26,9 @@ class DocumentContentForm(forms.Form):
|
|||||||
except AttributeError:
|
except AttributeError:
|
||||||
document_pages = []
|
document_pages = []
|
||||||
|
|
||||||
for page in document_pages:
|
for document_page in document_pages:
|
||||||
try:
|
try:
|
||||||
page_content = page.content.content
|
page_content = document_page.content_object.content.content
|
||||||
except DocumentVersionPageContent.DoesNotExist:
|
except DocumentVersionPageContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
@@ -37,7 +37,7 @@ class DocumentContentForm(forms.Form):
|
|||||||
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
|
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
|
||||||
ugettext(
|
ugettext(
|
||||||
'Page %(page_number)d'
|
'Page %(page_number)d'
|
||||||
) % {'page_number': page.page_number}
|
) % {'page_number': document_page.page_number}
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -72,7 +72,7 @@ class DocumentPageContentForm(forms.Form):
|
|||||||
self.fields['contents'].initial = ''
|
self.fields['contents'].initial = ''
|
||||||
|
|
||||||
try:
|
try:
|
||||||
page_content = document_page.content.content
|
page_content = document_page.content_object.content.content
|
||||||
except DocumentVersionPageContent.DoesNotExist:
|
except DocumentVersionPageContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -22,7 +22,9 @@ class DocumentPageContentManager(models.Manager):
|
|||||||
def delete_content_for(self, document, user=None):
|
def delete_content_for(self, document, user=None):
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
for document_page in document.pages.all():
|
for document_page in document.pages.all():
|
||||||
self.filter(document_page=document_page).delete()
|
self.filter(
|
||||||
|
document_version_page=document_page.content_object
|
||||||
|
).delete()
|
||||||
|
|
||||||
event_parsing_document_content_deleted.commit(
|
event_parsing_document_content_deleted.commit(
|
||||||
actor=user, target=document
|
actor=user, target=document
|
||||||
|
|||||||
@@ -11,32 +11,6 @@ from mayan.apps.documents.models import (
|
|||||||
from .managers import DocumentPageContentManager, DocumentTypeSettingsManager
|
from .managers import DocumentPageContentManager, DocumentTypeSettingsManager
|
||||||
|
|
||||||
|
|
||||||
@python_2_unicode_compatible
|
|
||||||
class DocumentVersionPageContent(models.Model):
|
|
||||||
"""
|
|
||||||
This model stores the parsed content of a document page.
|
|
||||||
"""
|
|
||||||
document_version_page = models.OneToOneField(
|
|
||||||
on_delete=models.CASCADE, related_name='content',
|
|
||||||
to=DocumentVersionPage, verbose_name=_('Document version page')
|
|
||||||
)
|
|
||||||
content = models.TextField(
|
|
||||||
blank=True, help_text=_(
|
|
||||||
'The actual text content as extracted by the document '
|
|
||||||
'parsing backend.'
|
|
||||||
), verbose_name=_('Content')
|
|
||||||
)
|
|
||||||
|
|
||||||
objects = DocumentPageContentManager()
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
verbose_name = _('Document version page content')
|
|
||||||
verbose_name_plural = _('Document version pages contents')
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return force_text(self.document_page)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentTypeSettings(models.Model):
|
class DocumentTypeSettings(models.Model):
|
||||||
"""
|
"""
|
||||||
This model stores the parsing settings for a document type.
|
This model stores the parsing settings for a document type.
|
||||||
@@ -62,6 +36,32 @@ class DocumentTypeSettings(models.Model):
|
|||||||
verbose_name_plural = _('Document types settings')
|
verbose_name_plural = _('Document types settings')
|
||||||
|
|
||||||
|
|
||||||
|
@python_2_unicode_compatible
|
||||||
|
class DocumentVersionPageContent(models.Model):
|
||||||
|
"""
|
||||||
|
This model stores the parsed content of a document page.
|
||||||
|
"""
|
||||||
|
document_version_page = models.OneToOneField(
|
||||||
|
on_delete=models.CASCADE, related_name='content',
|
||||||
|
to=DocumentVersionPage, verbose_name=_('Document version page')
|
||||||
|
)
|
||||||
|
content = models.TextField(
|
||||||
|
blank=True, help_text=_(
|
||||||
|
'The actual text content as extracted by the document '
|
||||||
|
'parsing backend.'
|
||||||
|
), verbose_name=_('Content')
|
||||||
|
)
|
||||||
|
|
||||||
|
objects = DocumentPageContentManager()
|
||||||
|
|
||||||
|
class Meta:
|
||||||
|
verbose_name = _('Document version page content')
|
||||||
|
verbose_name_plural = _('Document version pages contents')
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return force_text(self.document_page)
|
||||||
|
|
||||||
|
|
||||||
@python_2_unicode_compatible
|
@python_2_unicode_compatible
|
||||||
class DocumentVersionParseError(models.Model):
|
class DocumentVersionParseError(models.Model):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -23,11 +23,13 @@ class Parser(object):
|
|||||||
_registry = {}
|
_registry = {}
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def parse_document_page(cls, document_page):
|
def parse_document_version_page(cls, document_version_page):
|
||||||
for parser_class in cls._registry.get(document_page.document_version.mimetype, ()):
|
for parser_class in cls._registry.get(document_version_page.document_version.mimetype, ()):
|
||||||
try:
|
try:
|
||||||
parser = parser_class()
|
parser = parser_class()
|
||||||
parser.process_document_page(document_page)
|
parser.process_document_page(
|
||||||
|
document_version_page=document_version_page
|
||||||
|
)
|
||||||
except ParserError:
|
except ParserError:
|
||||||
# If parser raises error, try next parser in the list
|
# If parser raises error, try next parser in the list
|
||||||
pass
|
pass
|
||||||
@@ -41,7 +43,9 @@ class Parser(object):
|
|||||||
for parser_class in cls._registry.get(document_version.mimetype, ()):
|
for parser_class in cls._registry.get(document_version.mimetype, ()):
|
||||||
try:
|
try:
|
||||||
parser = parser_class()
|
parser = parser_class()
|
||||||
parser.process_document_version(document_version)
|
parser.process_document_version(
|
||||||
|
document_version=document_version
|
||||||
|
)
|
||||||
except ParserError:
|
except ParserError:
|
||||||
# If parser raises error, try next parser in the list
|
# If parser raises error, try next parser in the list
|
||||||
pass
|
pass
|
||||||
@@ -64,10 +68,12 @@ class Parser(object):
|
|||||||
)
|
)
|
||||||
logger.debug('document version: %d', document_version.pk)
|
logger.debug('document version: %d', document_version.pk)
|
||||||
|
|
||||||
for document_page in document_version.pages.all():
|
for document_version_page in document_version.pages.all():
|
||||||
self.process_document_page(document_page=document_page)
|
self.process_document_version_page(
|
||||||
|
document_version_page=document_version_page
|
||||||
|
)
|
||||||
|
|
||||||
def process_document_page(self, document_page):
|
def process_document_version_page(self, document_version_page):
|
||||||
DocumentVersionPageContent = apps.get_model(
|
DocumentVersionPageContent = apps.get_model(
|
||||||
app_label='document_parsing',
|
app_label='document_parsing',
|
||||||
model_name='DocumentVersionPageContent'
|
model_name='DocumentVersionPageContent'
|
||||||
@@ -75,19 +81,20 @@ class Parser(object):
|
|||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
'Processing page: %d of document version: %s',
|
'Processing page: %d of document version: %s',
|
||||||
document_page.page_number, document_page.document_version
|
document_version_page.page_number,
|
||||||
|
document_version_page.document_version
|
||||||
)
|
)
|
||||||
|
|
||||||
file_object = document_page.document_version.get_intermediate_file()
|
file_object = document_version_page.document_version.get_intermediate_file()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
document_page_content, created = DocumentVersionPageContent.objects.get_or_create(
|
document_version_page_content, created = DocumentVersionPageContent.objects.get_or_create(
|
||||||
document_page=document_page
|
document_version_page=document_version_page
|
||||||
)
|
)
|
||||||
document_page_content.content = self.execute(
|
document_version_page_content.content = self.execute(
|
||||||
file_object=file_object, page_number=document_page.page_number
|
file_object=file_object, page_number=document_version_page.page_number
|
||||||
)
|
)
|
||||||
document_page_content.save()
|
document_version_page_content.save()
|
||||||
except Exception as exception:
|
except Exception as exception:
|
||||||
error_message = _('Exception parsing page; %s') % exception
|
error_message = _('Exception parsing page; %s') % exception
|
||||||
logger.error(error_message)
|
logger.error(error_message)
|
||||||
@@ -97,7 +104,8 @@ class Parser(object):
|
|||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
'Finished processing page: %d of document version: %s',
|
'Finished processing page: %d of document version: %s',
|
||||||
document_page.page_number, document_page.document_version
|
document_version_page.page_number,
|
||||||
|
document_version_page.document_version
|
||||||
)
|
)
|
||||||
|
|
||||||
def execute(self, file_object, page_number):
|
def execute(self, file_object, page_number):
|
||||||
|
|||||||
@@ -14,8 +14,8 @@ def task_parse_document_version(document_version_pk):
|
|||||||
DocumentVersion = apps.get_model(
|
DocumentVersion = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentVersion'
|
app_label='documents', model_name='DocumentVersion'
|
||||||
)
|
)
|
||||||
DocumentPageContent = apps.get_model(
|
DocumentVersionPageContent = apps.get_model(
|
||||||
app_label='document_parsing', model_name='DocumentPageContent'
|
app_label='document_parsing', model_name='DocumentVersionPageContent'
|
||||||
)
|
)
|
||||||
|
|
||||||
document_version = DocumentVersion.objects.get(
|
document_version = DocumentVersion.objects.get(
|
||||||
@@ -24,6 +24,6 @@ def task_parse_document_version(document_version_pk):
|
|||||||
logger.info(
|
logger.info(
|
||||||
'Starting parsing for document version: %s', document_version
|
'Starting parsing for document version: %s', document_version
|
||||||
)
|
)
|
||||||
DocumentPageContent.objects.process_document_version(
|
DocumentVersionPageContent.objects.process_document_version(
|
||||||
document_version=document_version
|
document_version=document_version
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ from ..events import (
|
|||||||
event_parsing_document_version_submit,
|
event_parsing_document_version_submit,
|
||||||
event_parsing_document_version_finish
|
event_parsing_document_version_finish
|
||||||
)
|
)
|
||||||
from ..models import DocumentPageContent
|
from ..models import DocumentVersionPageContent
|
||||||
|
|
||||||
|
|
||||||
class DocumentParsingEventsTestCase(GenericDocumentTestCase):
|
class DocumentParsingEventsTestCase(GenericDocumentTestCase):
|
||||||
@@ -19,7 +19,7 @@ class DocumentParsingEventsTestCase(GenericDocumentTestCase):
|
|||||||
|
|
||||||
def test_document_content_deleted_event(self):
|
def test_document_content_deleted_event(self):
|
||||||
Action.objects.all().delete()
|
Action.objects.all().delete()
|
||||||
DocumentPageContent.objects.delete_content_for(
|
DocumentVersionPageContent.objects.delete_content_for(
|
||||||
document=self.test_document
|
document=self.test_document
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -18,5 +18,5 @@ class ParserTestCase(DocumentTestMixin, BaseTestCase):
|
|||||||
parser.process_document_version(self.test_document.latest_version)
|
parser.process_document_version(self.test_document.latest_version)
|
||||||
|
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
TEST_DOCUMENT_CONTENT in self.test_document.pages.first().content.content
|
TEST_DOCUMENT_CONTENT in self.test_document.pages.first().content_object.content.content
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ from django.test import override_settings
|
|||||||
from mayan.apps.documents.tests.base import GenericDocumentViewTestCase
|
from mayan.apps.documents.tests.base import GenericDocumentViewTestCase
|
||||||
from mayan.apps.documents.tests.literals import TEST_HYBRID_DOCUMENT
|
from mayan.apps.documents.tests.literals import TEST_HYBRID_DOCUMENT
|
||||||
|
|
||||||
from ..models import DocumentPageContent
|
from ..models import DocumentVersionPageContent
|
||||||
from ..permissions import (
|
from ..permissions import (
|
||||||
permission_content_view, permission_document_type_parsing_setup,
|
permission_content_view, permission_document_type_parsing_setup,
|
||||||
permission_parse_document
|
permission_parse_document
|
||||||
@@ -72,8 +72,8 @@ class DocumentContentViewsTestCase(
|
|||||||
self.assertEqual(response.status_code, 404)
|
self.assertEqual(response.status_code, 404)
|
||||||
|
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
DocumentPageContent.objects.filter(
|
DocumentVersionPageContent.objects.filter(
|
||||||
document_page=self.test_document.pages.first()
|
document_version_page=self.test_document.pages.first().content_object
|
||||||
).exists()
|
).exists()
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -86,8 +86,8 @@ class DocumentContentViewsTestCase(
|
|||||||
self.assertEqual(response.status_code, 302)
|
self.assertEqual(response.status_code, 302)
|
||||||
|
|
||||||
self.assertFalse(
|
self.assertFalse(
|
||||||
DocumentPageContent.objects.filter(
|
DocumentVersionPageContent.objects.filter(
|
||||||
document_page=self.test_document.pages.first()
|
document_version_page=self.test_document.pages.first().content_object
|
||||||
).exists()
|
).exists()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -6,14 +6,28 @@ from django.utils.html import conditional_escape
|
|||||||
|
|
||||||
|
|
||||||
def get_document_content(document):
|
def get_document_content(document):
|
||||||
DocumentPageContent = apps.get_model(
|
DocumentVersionPageContent = apps.get_model(
|
||||||
app_label='document_parsing', model_name='DocumentPageContent'
|
app_label='document_parsing', model_name='DocumentVersionPageContent'
|
||||||
)
|
)
|
||||||
|
|
||||||
for page in document.pages.all():
|
for document_page in document.pages.all():
|
||||||
try:
|
try:
|
||||||
page_content = page.content.content
|
page_content = document_page.content_object.content.content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentVersionPageContent.DoesNotExist:
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
yield conditional_escape(force_text(page_content))
|
||||||
|
|
||||||
|
|
||||||
|
def get_document_version_content(document_version):
|
||||||
|
DocumentVersionPageContent = apps.get_model(
|
||||||
|
app_label='document_parsing', model_name='DocumentVersionPageContent'
|
||||||
|
)
|
||||||
|
|
||||||
|
for document_version_page in document_version.pages.all():
|
||||||
|
try:
|
||||||
|
page_content = document_version_page.content.content
|
||||||
|
except DocumentVersionPageContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
yield conditional_escape(force_text(page_content))
|
yield conditional_escape(force_text(page_content))
|
||||||
|
|||||||
@@ -235,7 +235,7 @@ class DocumentsApp(MayanAppConfig):
|
|||||||
model=DocumentPage, manager_name='passthrough'
|
model=DocumentPage, manager_name='passthrough'
|
||||||
)
|
)
|
||||||
ModelPermission.register_inheritance(
|
ModelPermission.register_inheritance(
|
||||||
model=DocumentPageResult, related='document_version__document',
|
model=DocumentPageResult, related='document',
|
||||||
)
|
)
|
||||||
ModelPermission.register_manager(
|
ModelPermission.register_manager(
|
||||||
model=DocumentPageResult, manager_name='passthrough'
|
model=DocumentPageResult, manager_name='passthrough'
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ class OCRApp(MayanAppConfig):
|
|||||||
app_label='documents', model_name='Document'
|
app_label='documents', model_name='Document'
|
||||||
)
|
)
|
||||||
DocumentPage = apps.get_model(
|
DocumentPage = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentVersionPage'
|
app_label='documents', model_name='DocumentPage'
|
||||||
)
|
)
|
||||||
DocumentType = apps.get_model(
|
DocumentType = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentType'
|
app_label='documents', model_name='DocumentType'
|
||||||
|
|||||||
Reference in New Issue
Block a user