Finish the document parsing app.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
|
|
||||||
from .models import (
|
from .models import (
|
||||||
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
|
DocumentPageContent, DocumentVersionParseError
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -12,12 +12,7 @@ class DocumentPageContentAdmin(admin.ModelAdmin):
|
|||||||
list_display = ('document_page',)
|
list_display = ('document_page',)
|
||||||
|
|
||||||
|
|
||||||
@admin.register(DocumentTypeSettings)
|
@admin.register(DocumentVersionParseError)
|
||||||
class DocumentTypeSettingsAdmin(admin.ModelAdmin):
|
class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
|
||||||
list_display = ('document_type', 'auto_ocr')
|
|
||||||
|
|
||||||
|
|
||||||
@admin.register(DocumentVersionOCRError)
|
|
||||||
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
|
|
||||||
list_display = ('document_version', 'datetime_submitted')
|
list_display = ('document_version', 'datetime_submitted')
|
||||||
readonly_fields = ('document_version', 'datetime_submitted', 'result')
|
readonly_fields = ('document_version', 'datetime_submitted', 'result')
|
||||||
|
|||||||
@@ -1,75 +1,19 @@
|
|||||||
from __future__ import absolute_import, unicode_literals
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
from rest_framework import generics, status
|
from rest_framework import generics
|
||||||
from rest_framework.response import Response
|
from rest_framework.response import Response
|
||||||
|
|
||||||
from documents.models import Document, DocumentPage, DocumentVersion
|
from documents.models import DocumentPage
|
||||||
from rest_api.permissions import MayanPermission
|
from rest_api.permissions import MayanPermission
|
||||||
|
|
||||||
from .models import DocumentPageContent
|
from .models import DocumentPageContent
|
||||||
from .permissions import permission_ocr_content_view, permission_ocr_document
|
from .permissions import permission_content_view
|
||||||
from .serializers import DocumentPageContentSerializer
|
from .serializers import DocumentPageContentSerializer
|
||||||
|
|
||||||
|
|
||||||
class APIDocumentOCRView(generics.GenericAPIView):
|
|
||||||
mayan_object_permissions = {
|
|
||||||
'POST': (permission_ocr_document,)
|
|
||||||
}
|
|
||||||
permission_classes = (MayanPermission,)
|
|
||||||
queryset = Document.objects.all()
|
|
||||||
|
|
||||||
def get_serializer_class(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def post(self, request, *args, **kwargs):
|
|
||||||
"""
|
|
||||||
Submit a document for OCR.
|
|
||||||
---
|
|
||||||
omit_serializer: true
|
|
||||||
parameters:
|
|
||||||
- name: pk
|
|
||||||
paramType: path
|
|
||||||
type: number
|
|
||||||
responseMessages:
|
|
||||||
- code: 202
|
|
||||||
message: Accepted
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.get_object().submit_for_ocr()
|
|
||||||
return Response(status=status.HTTP_202_ACCEPTED)
|
|
||||||
|
|
||||||
|
|
||||||
class APIDocumentVersionOCRView(generics.GenericAPIView):
|
|
||||||
mayan_object_permissions = {
|
|
||||||
'POST': (permission_ocr_document,)
|
|
||||||
}
|
|
||||||
permission_classes = (MayanPermission,)
|
|
||||||
queryset = DocumentVersion.objects.all()
|
|
||||||
|
|
||||||
def get_serializer_class(self):
|
|
||||||
return None
|
|
||||||
|
|
||||||
def post(self, request, *args, **kwargs):
|
|
||||||
"""
|
|
||||||
Submit a document version for OCR.
|
|
||||||
---
|
|
||||||
omit_serializer: true
|
|
||||||
parameters:
|
|
||||||
- name: pk
|
|
||||||
paramType: path
|
|
||||||
type: number
|
|
||||||
responseMessages:
|
|
||||||
- code: 202
|
|
||||||
message: Accepted
|
|
||||||
"""
|
|
||||||
|
|
||||||
self.get_object().submit_for_ocr()
|
|
||||||
return Response(status=status.HTTP_202_ACCEPTED)
|
|
||||||
|
|
||||||
|
|
||||||
class APIDocumentPageContentView(generics.RetrieveAPIView):
|
class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||||
"""
|
"""
|
||||||
Returns the OCR content of the selected document page.
|
Returns the content of the selected document page.
|
||||||
---
|
---
|
||||||
GET:
|
GET:
|
||||||
parameters:
|
parameters:
|
||||||
@@ -79,7 +23,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
mayan_object_permissions = {
|
mayan_object_permissions = {
|
||||||
'GET': (permission_ocr_content_view,),
|
'GET': (permission_content_view,),
|
||||||
}
|
}
|
||||||
permission_classes = (MayanPermission,)
|
permission_classes = (MayanPermission,)
|
||||||
serializer_class = DocumentPageContentSerializer
|
serializer_class = DocumentPageContentSerializer
|
||||||
@@ -89,9 +33,9 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
|||||||
instance = self.get_object()
|
instance = self.get_object()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
ocr_content = instance.ocr_content
|
content = instance.content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentPageContent.DoesNotExist:
|
||||||
ocr_content = DocumentPageContent.objects.none()
|
content = DocumentPageContent.objects.none()
|
||||||
|
|
||||||
serializer = self.get_serializer(ocr_content)
|
serializer = self.get_serializer(content)
|
||||||
return Response(serializer.data)
|
return Response(serializer.data)
|
||||||
|
|||||||
@@ -1,11 +1,12 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from kombu import Exchange, Queue
|
from kombu import Exchange, Queue
|
||||||
|
|
||||||
from django.apps import apps
|
from django.apps import apps
|
||||||
from django.db.models.signals import post_save
|
from django.utils.timezone import now
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
from acls import ModelPermission
|
from acls import ModelPermission
|
||||||
@@ -21,16 +22,38 @@ from mayan.celery import app
|
|||||||
from navigation import SourceColumn
|
from navigation import SourceColumn
|
||||||
from rest_api.classes import APIEndPoint
|
from rest_api.classes import APIEndPoint
|
||||||
|
|
||||||
|
from .events import event_parsing_document_version_submit
|
||||||
from .handlers import handler_parse_document_version
|
from .handlers import handler_parse_document_version
|
||||||
from .links import (
|
from .links import (
|
||||||
link_document_content, link_entry_list, link_document_content_errors_list,
|
link_document_content, link_document_content_download,
|
||||||
link_document_content_download
|
link_document_parsing_errors_list, link_document_submit_multiple,
|
||||||
|
link_document_submit, link_document_type_submit, link_error_list
|
||||||
)
|
)
|
||||||
from .permissions import permission_content_view
|
from .permissions import permission_content_view
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def document_parsing_submit(self):
|
||||||
|
latest_version = self.latest_version
|
||||||
|
# Don't error out if document has no version
|
||||||
|
if latest_version:
|
||||||
|
latest_version.submit_for_parsing()
|
||||||
|
|
||||||
|
|
||||||
|
def document_version_parsing_submit(self):
|
||||||
|
from .tasks import task_parse_document_version
|
||||||
|
|
||||||
|
event_parsing_document_version_submit.commit(
|
||||||
|
action_object=self.document, target=self
|
||||||
|
)
|
||||||
|
|
||||||
|
task_parse_document_version.apply_async(
|
||||||
|
eta=now() + timedelta(seconds=settings_db_sync_task_delay.value),
|
||||||
|
kwargs={'document_version_pk': self.pk},
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class DocumentParsingApp(MayanAppConfig):
|
class DocumentParsingApp(MayanAppConfig):
|
||||||
has_tests = True
|
has_tests = True
|
||||||
name = 'document_parsing'
|
name = 'document_parsing'
|
||||||
@@ -45,16 +68,17 @@ class DocumentParsingApp(MayanAppConfig):
|
|||||||
app_label='documents', model_name='Document'
|
app_label='documents', model_name='Document'
|
||||||
)
|
)
|
||||||
|
|
||||||
DocumentType = apps.get_model(
|
|
||||||
app_label='documents', model_name='DocumentType'
|
|
||||||
)
|
|
||||||
|
|
||||||
DocumentVersion = apps.get_model(
|
DocumentVersion = apps.get_model(
|
||||||
app_label='documents', model_name='DocumentVersion'
|
app_label='documents', model_name='DocumentVersion'
|
||||||
)
|
)
|
||||||
|
|
||||||
DocumentVersionParseError = self.get_model('DocumentVersionParseError')
|
DocumentVersionParseError = self.get_model('DocumentVersionParseError')
|
||||||
|
|
||||||
|
Document.add_to_class('submit_for_parsing', document_parsing_submit)
|
||||||
|
DocumentVersion.add_to_class(
|
||||||
|
'submit_for_parsing', document_version_parsing_submit
|
||||||
|
)
|
||||||
|
|
||||||
ModelPermission.register(
|
ModelPermission.register(
|
||||||
model=Document, permissions=(permission_content_view,)
|
model=Document, permissions=(permission_content_view,)
|
||||||
)
|
)
|
||||||
@@ -72,6 +96,18 @@ class DocumentParsingApp(MayanAppConfig):
|
|||||||
attribute='result'
|
attribute='result'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
app.conf.CELERY_QUEUES.append(
|
||||||
|
Queue('parsing', Exchange('parsing'), routing_key='parsing'),
|
||||||
|
)
|
||||||
|
|
||||||
|
app.conf.CELERY_ROUTES.update(
|
||||||
|
{
|
||||||
|
'document_parsing.tasks.task_parse_document_version': {
|
||||||
|
'queue': 'parsing'
|
||||||
|
},
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
document_search.add_model_field(
|
document_search.add_model_field(
|
||||||
field='versions__pages__content__content', label=_('Content')
|
field='versions__pages__content__content', label=_('Content')
|
||||||
)
|
)
|
||||||
@@ -89,32 +125,20 @@ class DocumentParsingApp(MayanAppConfig):
|
|||||||
menu_object.bind_links(
|
menu_object.bind_links(
|
||||||
links=(link_document_submit,), sources=(Document,)
|
links=(link_document_submit,), sources=(Document,)
|
||||||
)
|
)
|
||||||
menu_object.bind_links(
|
|
||||||
links=(link_document_type_ocr_settings,), sources=(DocumentType,)
|
|
||||||
)
|
|
||||||
menu_secondary.bind_links(
|
menu_secondary.bind_links(
|
||||||
links=(
|
links=(
|
||||||
link_document_content, link_document_ocr_erros_list,
|
link_document_content, link_document_parsing_errors_list,
|
||||||
link_document_ocr_download
|
link_document_content_download
|
||||||
),
|
),
|
||||||
sources=(
|
sources=(
|
||||||
'document_parsing:document_content',
|
'document_parsing:document_content',
|
||||||
'document_parsing:document_ocr_error_list',
|
'document_parsing:document_content_download',
|
||||||
'document_parsing:document_ocr_download',
|
'document_parsing:document_parsing_error_list',
|
||||||
)
|
|
||||||
)
|
|
||||||
menu_secondary.bind_links(
|
|
||||||
links=(link_entry_list,),
|
|
||||||
sources=(
|
|
||||||
'document_parsing:entry_list',
|
|
||||||
'document_parsing:entry_delete_multiple',
|
|
||||||
'document_parsing:entry_re_queue_multiple',
|
|
||||||
DocumentVersionParseError
|
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
menu_tools.bind_links(
|
menu_tools.bind_links(
|
||||||
links=(
|
links=(
|
||||||
link_entry_list
|
link_document_type_submit, link_error_list,
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
14
mayan/apps/document_parsing/events.py
Normal file
14
mayan/apps/document_parsing/events.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from events.classes import Event
|
||||||
|
|
||||||
|
event_parsing_document_version_submit = Event(
|
||||||
|
name='parsing_document_version_submit',
|
||||||
|
label=_('Document version submitted for parsing')
|
||||||
|
)
|
||||||
|
event_parsing_document_version_finish = Event(
|
||||||
|
name='parsing_document_version_finish',
|
||||||
|
label=_('Document version parsing finished')
|
||||||
|
)
|
||||||
@@ -1,13 +1,6 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
class OCRError(Exception):
|
|
||||||
"""
|
|
||||||
Raised by the OCR backend
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class ParserError(Exception):
|
class ParserError(Exception):
|
||||||
"""
|
"""
|
||||||
Base exception for file parsers
|
Base exception for file parsers
|
||||||
|
|||||||
@@ -6,10 +6,12 @@ from django.utils.html import conditional_escape
|
|||||||
from django.utils.safestring import mark_safe
|
from django.utils.safestring import mark_safe
|
||||||
from django.utils.translation import ugettext_lazy as _, ugettext
|
from django.utils.translation import ugettext_lazy as _, ugettext
|
||||||
|
|
||||||
|
from acls.models import AccessControlList
|
||||||
from common.widgets import TextAreaDiv
|
from common.widgets import TextAreaDiv
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
|
|
||||||
from .models import DocumentPageContent, DocumentPageOCRContent
|
from .models import DocumentPageContent
|
||||||
|
from .permissions import permission_parse_document
|
||||||
|
|
||||||
|
|
||||||
class DocumentContentForm(forms.Form):
|
class DocumentContentForm(forms.Form):
|
||||||
@@ -29,7 +31,7 @@ class DocumentContentForm(forms.Form):
|
|||||||
|
|
||||||
for page in document_pages:
|
for page in document_pages:
|
||||||
try:
|
try:
|
||||||
page_content = page.ocr_content.content
|
page_content = page.content.content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentPageContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
@@ -55,50 +57,16 @@ class DocumentContentForm(forms.Form):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class DocumentOCRContentForm(forms.Form):
|
|
||||||
"""
|
|
||||||
Form that concatenates all of a document pages' text content into a
|
|
||||||
single textarea widget
|
|
||||||
"""
|
|
||||||
def __init__(self, *args, **kwargs):
|
|
||||||
self.document = kwargs.pop('instance', None)
|
|
||||||
super(DocumentContentForm, self).__init__(*args, **kwargs)
|
|
||||||
content = []
|
|
||||||
self.fields['contents'].initial = ''
|
|
||||||
try:
|
|
||||||
document_pages = self.document.pages.all()
|
|
||||||
except AttributeError:
|
|
||||||
document_pages = []
|
|
||||||
|
|
||||||
for page in document_pages:
|
|
||||||
try:
|
|
||||||
page_content = page.ocr_content.content
|
|
||||||
except DocumentPageOCRContent.DoesNotExist:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
content.append(conditional_escape(force_text(page_content)))
|
|
||||||
content.append(
|
|
||||||
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
|
|
||||||
ugettext(
|
|
||||||
'Page %(page_number)d'
|
|
||||||
) % {'page_number': page.page_number}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
self.fields['contents'].initial = mark_safe(''.join(content))
|
|
||||||
|
|
||||||
contents = forms.CharField(
|
|
||||||
label=_('Contents'),
|
|
||||||
widget=TextAreaDiv(
|
|
||||||
attrs={
|
|
||||||
'class': 'text_area_div full-height',
|
|
||||||
'data-height-difference': 360
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentTypeSelectForm(forms.Form):
|
class DocumentTypeSelectForm(forms.Form):
|
||||||
document_type = forms.ModelChoiceField(
|
document_type = forms.ModelChoiceField(
|
||||||
queryset=DocumentType.objects.all(), label=('Document type')
|
queryset=DocumentType.objects.none(), label=('Document type')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def __init__(self, *args, **kwargs):
|
||||||
|
user = kwargs.pop('user')
|
||||||
|
super(DocumentTypeSelectForm, self).__init__(*args, **kwargs)
|
||||||
|
queryset = AccessControlList.objects.filter_by_access(
|
||||||
|
permission=permission_parse_document,
|
||||||
|
queryset=DocumentType.objects.all(), user=user,
|
||||||
|
)
|
||||||
|
self.fields['document_type'].queryset = queryset
|
||||||
|
|||||||
@@ -2,14 +2,8 @@ from __future__ import unicode_literals
|
|||||||
|
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from django.apps import apps
|
|
||||||
|
|
||||||
from .settings import setting_auto_ocr
|
|
||||||
from .parsers import Parser
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def handler_parse_document_version(sender, instance, **kwargs):
|
def handler_parse_document_version(sender, instance, **kwargs):
|
||||||
if kwargs['created']:
|
instance.submit_for_parsing()
|
||||||
Parser.parse_document_version(document_version=instance)
|
|
||||||
|
|||||||
@@ -4,24 +4,36 @@ from django.utils.translation import ugettext_lazy as _
|
|||||||
|
|
||||||
from navigation import Link
|
from navigation import Link
|
||||||
|
|
||||||
from .permissions import permission_content_view
|
from .permissions import permission_content_view, permission_parse_document
|
||||||
|
|
||||||
link_document_content = Link(
|
link_document_content = Link(
|
||||||
args='resolved_object.id', icon='fa fa-font',
|
args='resolved_object.id', icon='fa fa-font',
|
||||||
permissions=(permission_content_view,), text=_('Content'),
|
permissions=(permission_content_view,), text=_('Content'),
|
||||||
view='document_parsing:document_content',
|
view='document_parsing:document_content',
|
||||||
)
|
)
|
||||||
link_entry_list = Link(
|
link_document_parsing_errors_list = Link(
|
||||||
icon='fa fa-file-text-o', permissions=(permission_ocr_document,),
|
|
||||||
text=_('Parsing errors'), view='document_parsing:entry_list'
|
|
||||||
)
|
|
||||||
link_document_content_errors_list = Link(
|
|
||||||
args='resolved_object.id', icon='fa fa-file-text-o',
|
args='resolved_object.id', icon='fa fa-file-text-o',
|
||||||
permissions=(permission_ocr_content_view,), text=_('Parsing errors'),
|
permissions=(permission_content_view,), text=_('Parsing errors'),
|
||||||
view='document_parsing:document_page_parsing_error_list'
|
view='document_parsing:document_parsing_error_list'
|
||||||
)
|
)
|
||||||
link_document_content_download = Link(
|
link_document_content_download = Link(
|
||||||
args='resolved_object.id', icon='fa fa-file-text-o',
|
args='resolved_object.id', icon='fa fa-file-text-o',
|
||||||
permissions=(permission_ocr_content_view,), text=_('Download content'),
|
permissions=(permission_content_view,), text=_('Download content'),
|
||||||
view='document_parsing:document_content_download'
|
view='document_parsing:document_content_download'
|
||||||
)
|
)
|
||||||
|
link_document_submit_multiple = Link(
|
||||||
|
text=_('Submit for parsing'),
|
||||||
|
view='document_parsing:document_submit_multiple'
|
||||||
|
)
|
||||||
|
link_document_submit = Link(
|
||||||
|
args='resolved_object.id', permissions=(permission_parse_document,),
|
||||||
|
text=_('Submit for parsing'), view='document_parsing:document_submit'
|
||||||
|
)
|
||||||
|
link_document_type_submit = Link(
|
||||||
|
icon='fa fa-crosshairs', text=_('Parse documents per type'),
|
||||||
|
view='document_parsing:document_type_submit'
|
||||||
|
)
|
||||||
|
link_error_list = Link(
|
||||||
|
icon='fa fa-file-text-o', permissions=(permission_content_view,),
|
||||||
|
text=_('Parsing errors'), view='document_parsing:error_list'
|
||||||
|
)
|
||||||
|
|||||||
@@ -1,14 +1,50 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from datetime import timedelta
|
|
||||||
import logging
|
import logging
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
from django.apps import apps
|
from django.conf import settings
|
||||||
from django.db import models
|
from django.db import models
|
||||||
from django.utils.timezone import now
|
|
||||||
|
from .events import event_parsing_document_version_finish
|
||||||
|
from .parsers import Parser
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
class DocumentPageContentManager(models.Manager):
|
class DocumentPageContentManager(models.Manager):
|
||||||
pass
|
def process_document_version(self, document_version):
|
||||||
|
logger.info(
|
||||||
|
'Starting parsing for document version: %s', document_version
|
||||||
|
)
|
||||||
|
logger.debug('document version: %d', document_version.pk)
|
||||||
|
|
||||||
|
try:
|
||||||
|
Parser.parse_document_version(document_version=document_version)
|
||||||
|
except Exception as exception:
|
||||||
|
logger.exception(
|
||||||
|
'Parsing error for document version: %d; %s',
|
||||||
|
document_version.pk, exception,
|
||||||
|
)
|
||||||
|
|
||||||
|
if settings.DEBUG:
|
||||||
|
result = []
|
||||||
|
type, value, tb = sys.exc_info()
|
||||||
|
result.append('%s: %s' % (type.__name__, value))
|
||||||
|
result.extend(traceback.format_tb(tb))
|
||||||
|
document_version.parsing_errors.create(
|
||||||
|
result='\n'.join(result)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
document_version.parsing_errors.create(result=exception)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
'Parsing complete for document version: %s', document_version
|
||||||
|
)
|
||||||
|
document_version.parsing_errors.all().delete()
|
||||||
|
|
||||||
|
event_parsing_document_version_finish.commit(
|
||||||
|
action_object=document_version.document,
|
||||||
|
target=document_version
|
||||||
|
)
|
||||||
|
|||||||
44
mayan/apps/document_parsing/migrations/0001_initial.py
Normal file
44
mayan/apps/document_parsing/migrations/0001_initial.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.10.7 on 2017-08-23 18:55
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
initial = True
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('documents', '0041_auto_20170823_1855'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.CreateModel(
|
||||||
|
name='DocumentPageContent',
|
||||||
|
fields=[
|
||||||
|
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||||
|
('content', models.TextField(blank=True, verbose_name='Content')),
|
||||||
|
('document_page', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='content', to='documents.DocumentPage', verbose_name='Document page')),
|
||||||
|
],
|
||||||
|
options={
|
||||||
|
'verbose_name': 'Document page content',
|
||||||
|
'verbose_name_plural': 'Document pages contents',
|
||||||
|
},
|
||||||
|
),
|
||||||
|
migrations.CreateModel(
|
||||||
|
name='DocumentVersionParseError',
|
||||||
|
fields=[
|
||||||
|
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||||
|
('datetime_submitted', models.DateTimeField(auto_now_add=True, db_index=True, verbose_name='Date time submitted')),
|
||||||
|
('result', models.TextField(blank=True, null=True, verbose_name='Result')),
|
||||||
|
('document_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='parse_errors', to='documents.DocumentVersion', verbose_name='Document version')),
|
||||||
|
],
|
||||||
|
options={
|
||||||
|
'ordering': ('datetime_submitted',),
|
||||||
|
'verbose_name': 'Document version parse error',
|
||||||
|
'verbose_name_plural': 'Document version parse errors',
|
||||||
|
},
|
||||||
|
),
|
||||||
|
]
|
||||||
0
mayan/apps/document_parsing/migrations/__init__.py
Normal file
0
mayan/apps/document_parsing/migrations/__init__.py
Normal file
@@ -4,7 +4,7 @@ from django.db import models
|
|||||||
from django.utils.encoding import force_text, python_2_unicode_compatible
|
from django.utils.encoding import force_text, python_2_unicode_compatible
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
from documents.models import DocumentPage, DocumentVersion
|
||||||
|
|
||||||
from .managers import DocumentPageContentManager
|
from .managers import DocumentPageContentManager
|
||||||
|
|
||||||
@@ -30,11 +30,11 @@ class DocumentPageContent(models.Model):
|
|||||||
@python_2_unicode_compatible
|
@python_2_unicode_compatible
|
||||||
class DocumentVersionParseError(models.Model):
|
class DocumentVersionParseError(models.Model):
|
||||||
document_version = models.ForeignKey(
|
document_version = models.ForeignKey(
|
||||||
DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
|
DocumentVersion, on_delete=models.CASCADE,
|
||||||
verbose_name=_('Document version')
|
related_name='parsing_errors', verbose_name=_('Document version')
|
||||||
)
|
)
|
||||||
datetime_submitted = models.DateTimeField(
|
datetime_submitted = models.DateTimeField(
|
||||||
auto_add_now=True, db_index=True, verbose_name=_('Date time submitted')
|
auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
|
||||||
)
|
)
|
||||||
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
||||||
|
|
||||||
|
|||||||
@@ -1,20 +1,15 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
||||||
from pdfminer.pdfpage import PDFPage
|
|
||||||
from pdfminer.converter import TextConverter
|
|
||||||
from pdfminer.layout import LAParams
|
|
||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
from common.utils import copyfile, fs_cleanup, mkstemp
|
from common.utils import copyfile, fs_cleanup, mkstemp
|
||||||
|
|
||||||
from .exceptions import ParserError, NoMIMETypeMatch
|
from .exceptions import ParserError, NoMIMETypeMatch
|
||||||
from .models import DocumentPageContent
|
|
||||||
from .settings import setting_pdftotext_path
|
from .settings import setting_pdftotext_path
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
@@ -82,6 +77,10 @@ class Parser(object):
|
|||||||
self.process_document_page(document_page=document_page)
|
self.process_document_page(document_page=document_page)
|
||||||
|
|
||||||
def process_document_page(self, document_page):
|
def process_document_page(self, document_page):
|
||||||
|
DocumentPageContent = apps.get_model(
|
||||||
|
app_label='document_parsing', model_name='DocumentPageContent'
|
||||||
|
)
|
||||||
|
|
||||||
logger.info(
|
logger.info(
|
||||||
'Processing page: %d of document version: %s',
|
'Processing page: %d of document version: %s',
|
||||||
document_page.page_number, document_page.document_version
|
document_page.page_number, document_page.document_version
|
||||||
@@ -171,32 +170,7 @@ class PopplerParser(Parser):
|
|||||||
return output
|
return output
|
||||||
|
|
||||||
|
|
||||||
class PDFMinerParser(Parser):
|
|
||||||
"""
|
|
||||||
Parser for PDF files using the PDFMiner library for Python
|
|
||||||
"""
|
|
||||||
|
|
||||||
def execute(self, file_object, page_number):
|
|
||||||
logger.debug('Parsing PDF page: %d', page_number)
|
|
||||||
|
|
||||||
with BytesIO() as string_buffer:
|
|
||||||
rsrcmgr = PDFResourceManager()
|
|
||||||
device = TextConverter(
|
|
||||||
rsrcmgr, outfp=string_buffer, laparams=LAParams()
|
|
||||||
)
|
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
|
||||||
page = PDFPage.get_pages(
|
|
||||||
file_object, maxpages=1, pagenos=(page_number - 1,)
|
|
||||||
)
|
|
||||||
interpreter.process_page(page.next())
|
|
||||||
device.close()
|
|
||||||
|
|
||||||
logger.debug('Finished parsing PDF: %d', page_number)
|
|
||||||
|
|
||||||
return string_buffer.getvalue()
|
|
||||||
|
|
||||||
|
|
||||||
Parser.register(
|
Parser.register(
|
||||||
mimetypes=('application/pdf',),
|
mimetypes=('application/pdf',),
|
||||||
parser_classes=(PopplerParser, PDFMinerParser)
|
parser_classes=(PopplerParser,)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -9,3 +9,7 @@ namespace = PermissionNamespace('document_parsing', _('Document parsing'))
|
|||||||
permission_content_view = namespace.add_permission(
|
permission_content_view = namespace.add_permission(
|
||||||
name='content_view', label=_('View the content of a document')
|
name='content_view', label=_('View the content of a document')
|
||||||
)
|
)
|
||||||
|
|
||||||
|
permission_parse_document = namespace.add_permission(
|
||||||
|
name='parse_document', label=_('Parse the content of a document')
|
||||||
|
)
|
||||||
|
|||||||
@@ -4,7 +4,8 @@ from django.utils.translation import ugettext_lazy as _
|
|||||||
|
|
||||||
from task_manager.classes import CeleryQueue
|
from task_manager.classes import CeleryQueue
|
||||||
|
|
||||||
queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
|
queue_ocr = CeleryQueue(name='parsing', label=_('Parsing'))
|
||||||
queue_ocr.add_task_type(
|
queue_ocr.add_task_type(
|
||||||
name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
|
name='document_parsing.tasks.task_parse_document_version',
|
||||||
|
label=_('Document version parsing')
|
||||||
)
|
)
|
||||||
|
|||||||
29
mayan/apps/document_parsing/tasks.py
Normal file
29
mayan/apps/document_parsing/tasks.py
Normal file
@@ -0,0 +1,29 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
|
|
||||||
|
from mayan.celery import app
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@app.task(ignore_result=True)
|
||||||
|
def task_parse_document_version(document_version_pk):
|
||||||
|
DocumentVersion = apps.get_model(
|
||||||
|
app_label='documents', model_name='DocumentVersion'
|
||||||
|
)
|
||||||
|
DocumentPageContent = apps.get_model(
|
||||||
|
app_label='document_parsing', model_name='DocumentPageContent'
|
||||||
|
)
|
||||||
|
|
||||||
|
document_version = DocumentVersion.objects.get(
|
||||||
|
pk=document_version_pk
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
'Starting parsing for document version: %s', document_version
|
||||||
|
)
|
||||||
|
DocumentPageContent.objects.process_document_version(
|
||||||
|
document_version=document_version
|
||||||
|
)
|
||||||
@@ -1,88 +0,0 @@
|
|||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
import json
|
|
||||||
|
|
||||||
from django.contrib.auth import get_user_model
|
|
||||||
from django.urls import reverse
|
|
||||||
|
|
||||||
from rest_framework import status
|
|
||||||
|
|
||||||
from documents.models import DocumentType
|
|
||||||
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
|
||||||
from rest_api.tests import BaseAPITestCase
|
|
||||||
from user_management.tests import (
|
|
||||||
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints
    """

    def setUp(self):
        """
        Create and log in a superuser, then create a document type and
        upload the small test document to it.
        """
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the test document as raw bytes so that no newline
        # translation or text decoding is applied to it.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        """Delete the document type created in setUp."""
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        """Submitting a document returns 202 and yields OCR content."""
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the check fails
        self.assertIn('Mayan EDMS Documentation', content)

    def test_submit_document_version(self):
        """Submitting a document version returns 202 and yields OCR content."""
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_get_document_version_page_content(self):
        """The page content endpoint returns 200 and the page text."""
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)

        self.assertIn(
            'Mayan EDMS Documentation',
            json.loads(response.content)['content']
        )
|
|
||||||
@@ -2,40 +2,38 @@ from __future__ import unicode_literals
|
|||||||
|
|
||||||
from actstream.models import Action
|
from actstream.models import Action
|
||||||
|
|
||||||
|
from documents.tests.literals import TEST_DOCUMENT_FILENAME
|
||||||
from documents.tests.test_models import GenericDocumentTestCase
|
from documents.tests.test_models import GenericDocumentTestCase
|
||||||
|
|
||||||
from ..events import (
|
from ..events import (
|
||||||
event_ocr_document_version_submit, event_ocr_document_version_finish
|
event_parsing_document_version_submit,
|
||||||
|
event_parsing_document_version_finish
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
class OCREventsTestCase(GenericDocumentTestCase):
|
class DocumentParsingEventsTestCase(GenericDocumentTestCase):
|
||||||
|
# Ensure we use a PDF file
|
||||||
|
test_document_filename = TEST_DOCUMENT_FILENAME
|
||||||
|
|
||||||
def test_document_version_submit_event(self):
|
def test_document_version_submit_event(self):
|
||||||
Action.objects.all().delete()
|
Action.objects.all().delete()
|
||||||
self.document.submit_for_ocr()
|
self.document.submit_for_parsing()
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
Action.objects.first().target, self.document.latest_version
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
Action.objects.first().verb,
|
|
||||||
event_ocr_document_version_submit.name
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_document_version_finish_event(self):
|
|
||||||
Action.objects.all().delete()
|
|
||||||
self.document.submit_for_ocr()
|
|
||||||
from ..models import DocumentVersionOCRError, DocumentPageContent
|
|
||||||
#print DocumentVersionOCRError.objects.all()
|
|
||||||
print DocumentPageContent.objects.all()
|
|
||||||
|
|
||||||
for a in Action.objects.all():
|
|
||||||
print a
|
|
||||||
|
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
Action.objects.last().target, self.document.latest_version
|
Action.objects.last().target, self.document.latest_version
|
||||||
)
|
)
|
||||||
self.assertEqual(
|
self.assertEqual(
|
||||||
Action.objects.last().verb,
|
Action.objects.last().verb,
|
||||||
event_ocr_document_version_finish.name
|
event_parsing_document_version_submit.name
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_document_version_finish_event(self):
|
||||||
|
Action.objects.all().delete()
|
||||||
|
self.document.submit_for_parsing()
|
||||||
|
self.assertEqual(
|
||||||
|
Action.objects.first().target, self.document.latest_version
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
Action.objects.first().verb,
|
||||||
|
event_parsing_document_version_finish.name
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,77 +0,0 @@
|
|||||||
# -*- coding: utf-8 -*-
|
|
||||||
|
|
||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from common.tests import BaseTestCase
|
|
||||||
from documents.models import DocumentType
|
|
||||||
from documents.settings import setting_language_choices
|
|
||||||
from documents.tests import (
|
|
||||||
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentOCRTestCase(BaseTestCase):
    """
    Check that a newly uploaded document ends up with OCR content.
    """
    # PyOCR's leak descriptor in get_available_languages and image_to_string
    # Disable descriptor leak test until fixed in upstream
    _skip_file_descriptor_test = True

    def setUp(self):
        """Create a document type and upload the small test document."""
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the test document as raw bytes so that no newline
        # translation or text decoding is applied to it.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        """Delete the document and document type created in setUp."""
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """The first page's OCR content contains the expected title."""
        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the check fails
        self.assertIn('Mayan EDMS Documentation', content)
|
|
||||||
|
|
||||||
|
|
||||||
class GermanOCRSupportTestCase(BaseTestCase):
    """
    Check OCR of a document uploaded with the German language code.
    """
    # PyOCR's leak descriptor in get_available_languages and image_to_string
    # Disable descriptor leak test until fixed in upstream
    _skip_file_descriptor_test = True

    def setUp(self):
        """
        Create a document type and upload the German test document using
        the language code looked up from the language choices setting.
        """
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get corresponding language code for German from the default language
        # choices list
        language_code = [
            language for language in setting_language_choices.value
            if language[1] == 'German'
        ][0][0]

        self.assertEqual('deu', language_code)

        # Read the test document as raw bytes so that no newline
        # translation or text decoding is applied to it.
        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        """Delete the document type created in setUp."""
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """The first page's OCR content contains the German sample text."""
        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the check fails
        self.assertIn(
            'Repository für elektronische Dokumente.', content
        )
        self.assertIn(
            'Es bietet einen', content
        )
|
|
||||||
@@ -5,12 +5,9 @@ from django.test import override_settings
|
|||||||
|
|
||||||
from common.tests import BaseTestCase
|
from common.tests import BaseTestCase
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
from documents.tests import (
|
from documents.tests import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL
|
||||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
|
||||||
)
|
|
||||||
|
|
||||||
from ..classes import TextExtractor
|
from ..parsers import PopplerParser
|
||||||
from ..parsers import PDFMinerParser, PopplerParser
|
|
||||||
|
|
||||||
|
|
||||||
@override_settings(OCR_AUTO_OCR=False)
|
@override_settings(OCR_AUTO_OCR=False)
|
||||||
@@ -30,54 +27,11 @@ class ParserTestCase(BaseTestCase):
|
|||||||
self.document_type.delete()
|
self.document_type.delete()
|
||||||
super(ParserTestCase, self).tearDown()
|
super(ParserTestCase, self).tearDown()
|
||||||
|
|
||||||
def test_pdfminer_parser(self):
|
|
||||||
parser = PDFMinerParser()
|
|
||||||
|
|
||||||
parser.process_document_version(self.document.latest_version)
|
|
||||||
|
|
||||||
self.assertTrue(
|
|
||||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_poppler_parser(self):
|
def test_poppler_parser(self):
|
||||||
parser = PopplerParser()
|
parser = PopplerParser()
|
||||||
|
|
||||||
parser.process_document_version(self.document.latest_version)
|
parser.process_document_version(self.document.latest_version)
|
||||||
|
|
||||||
self.assertTrue(
|
self.assertTrue(
|
||||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
'Mayan EDMS Documentation' in self.document.pages.first().content.content
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
@override_settings(OCR_AUTO_OCR=False)
|
|
||||||
class TextExtractorTestCase(BaseTestCase):
|
|
||||||
def setUp(self):
|
|
||||||
super(TextExtractorTestCase, self).setUp()
|
|
||||||
|
|
||||||
self.document_type = DocumentType.objects.create(
|
|
||||||
label=TEST_DOCUMENT_TYPE_LABEL
|
|
||||||
)
|
|
||||||
|
|
||||||
with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
|
|
||||||
self.document = self.document_type.new_document(
|
|
||||||
file_object=File(file_object)
|
|
||||||
)
|
|
||||||
|
|
||||||
def tearDown(self):
|
|
||||||
self.document_type.delete()
|
|
||||||
super(TextExtractorTestCase, self).tearDown()
|
|
||||||
|
|
||||||
def test_text_extractor(self):
|
|
||||||
TextExtractor.process_document_version(
|
|
||||||
document_version=self.document.latest_version
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
self.document.latest_version.pages.first().ocr_content.content,
|
|
||||||
'Sample text',
|
|
||||||
)
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
self.document.latest_version.pages.last().ocr_content.content,
|
|
||||||
'Sample text in image form',
|
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,26 +1,25 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
from django.test import override_settings
|
from documents.tests.literals import TEST_DOCUMENT_FILENAME
|
||||||
|
|
||||||
from documents.tests.test_views import GenericDocumentViewTestCase
|
from documents.tests.test_views import GenericDocumentViewTestCase
|
||||||
|
|
||||||
from ..permissions import permission_ocr_content_view
|
from ..permissions import permission_content_view
|
||||||
from ..utils import get_document_ocr_content
|
from ..utils import get_document_content
|
||||||
|
|
||||||
|
|
||||||
@override_settings(OCR_AUTO_OCR=True)
|
class DocumentContentViewsTestCase(GenericDocumentViewTestCase):
|
||||||
class OCRViewsTestCase(GenericDocumentViewTestCase):
|
|
||||||
# PyOCR's leak descriptor in get_available_languages and image_to_string
|
|
||||||
# Disable descriptor leak test until fixed in upstream
|
|
||||||
_skip_file_descriptor_test = True
|
_skip_file_descriptor_test = True
|
||||||
|
|
||||||
|
# Ensure we use a PDF file
|
||||||
|
test_document_filename = TEST_DOCUMENT_FILENAME
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
super(OCRViewsTestCase, self).setUp()
|
super(DocumentContentViewsTestCase, self).setUp()
|
||||||
self.login_user()
|
self.login_user()
|
||||||
|
|
||||||
def _document_content_view(self):
|
def _document_content_view(self):
|
||||||
return self.get(
|
return self.get(
|
||||||
'ocr:document_content', args=(self.document.pk,)
|
'document_parsing:document_content', args=(self.document.pk,)
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_document_content_view_no_permissions(self):
|
def test_document_content_view_no_permissions(self):
|
||||||
@@ -29,7 +28,7 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
|
|||||||
self.assertEqual(response.status_code, 403)
|
self.assertEqual(response.status_code, 403)
|
||||||
|
|
||||||
def test_document_content_view_with_permission(self):
|
def test_document_content_view_with_permission(self):
|
||||||
self.grant_permission(permission=permission_ocr_content_view)
|
self.grant_permission(permission=permission_content_view)
|
||||||
|
|
||||||
response = self._document_content_view()
|
response = self._document_content_view()
|
||||||
|
|
||||||
@@ -37,25 +36,25 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
|
|||||||
response, 'Mayan EDMS Documentation', status_code=200
|
response, 'Mayan EDMS Documentation', status_code=200
|
||||||
)
|
)
|
||||||
|
|
||||||
def test_document_ocr_download_view_no_permission(self):
|
def test_document_parsing_download_view_no_permission(self):
|
||||||
response = self.get(
|
response = self.get(
|
||||||
'ocr:document_ocr_download', args=(self.document.pk,)
|
'document_parsing:document_content_download', args=(self.document.pk,)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(response.status_code, 403)
|
self.assertEqual(response.status_code, 403)
|
||||||
|
|
||||||
def test_document_download_view_with_permission(self):
|
def test_download_view_with_permission(self):
|
||||||
self.expected_content_type = 'application/octet-stream; charset=utf-8'
|
self.expected_content_type = 'application/octet-stream; charset=utf-8'
|
||||||
|
|
||||||
self.grant_permission(permission=permission_ocr_content_view)
|
self.grant_permission(permission=permission_content_view)
|
||||||
response = self.get(
|
response = self.get(
|
||||||
'ocr:document_ocr_download', args=(self.document.pk,)
|
'document_parsing:document_content_download', args=(self.document.pk,)
|
||||||
)
|
)
|
||||||
|
|
||||||
self.assertEqual(response.status_code, 200)
|
self.assertEqual(response.status_code, 200)
|
||||||
|
|
||||||
self.assert_download_response(
|
self.assert_download_response(
|
||||||
response, content=(
|
response, content=(
|
||||||
''.join(get_document_ocr_content(document=self.document))
|
''.join(get_document_content(document=self.document))
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -2,62 +2,43 @@ from __future__ import unicode_literals
|
|||||||
|
|
||||||
from django.conf.urls import url
|
from django.conf.urls import url
|
||||||
|
|
||||||
from .api_views import (
|
from .api_views import APIDocumentPageContentView
|
||||||
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
|
|
||||||
)
|
|
||||||
from .views import (
|
from .views import (
|
||||||
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
DocumentContentView, DocumentContentDownloadView,
|
||||||
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
|
DocumentParsingErrorsListView, DocumentSubmitView, DocumentTypeSubmitView,
|
||||||
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
|
ParseErrorListView
|
||||||
)
|
)
|
||||||
|
|
||||||
urlpatterns = [
|
urlpatterns = [
|
||||||
url(
|
url(
|
||||||
r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(),
|
r'^documents/(?P<pk>\d+)/content/$', DocumentContentView.as_view(),
|
||||||
name='document_content'
|
name='document_content'
|
||||||
),
|
),
|
||||||
url(
|
url(
|
||||||
r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
|
r'^documents/(?P<pk>\d+)/content/download/$',
|
||||||
name='document_submit'
|
DocumentContentDownloadView.as_view(), name='document_content_download'
|
||||||
),
|
),
|
||||||
url(
|
url(
|
||||||
r'^document/all/submit/$', DocumentAllSubmitView.as_view(),
|
r'^document_types/submit/$', DocumentTypeSubmitView.as_view(),
|
||||||
name='document_submit_all'
|
|
||||||
),
|
|
||||||
url(
|
|
||||||
r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
|
|
||||||
name='document_type_submit'
|
name='document_type_submit'
|
||||||
),
|
),
|
||||||
url(
|
url(
|
||||||
r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(),
|
r'^documents/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
|
||||||
|
name='document_submit'
|
||||||
|
),
|
||||||
|
url(
|
||||||
|
r'^documents/multiple/submit/$', DocumentSubmitView.as_view(),
|
||||||
name='document_submit_multiple'
|
name='document_submit_multiple'
|
||||||
),
|
),
|
||||||
url(
|
url(
|
||||||
r'^document_type/(?P<pk>\d+)/ocr/settings/$',
|
r'^documents/(?P<pk>\d+)/errors/$',
|
||||||
DocumentTypeSettingsEditView.as_view(),
|
DocumentParsingErrorsListView.as_view(),
|
||||||
name='document_type_ocr_settings'
|
name='document_parsing_error_list'
|
||||||
),
|
),
|
||||||
url(
|
url(r'^errors/all/$', ParseErrorListView.as_view(), name='error_list'),
|
||||||
r'^documents/(?P<pk>\d+)/ocr/errors/$',
|
|
||||||
DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
|
|
||||||
),
|
|
||||||
url(
|
|
||||||
r'^documents/(?P<pk>\d+)/ocr/download/$',
|
|
||||||
DocumentOCRDownloadView.as_view(), name='document_ocr_download'
|
|
||||||
),
|
|
||||||
url(r'^all/$', EntryListView.as_view(), name='entry_list'),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
api_urls = [
|
api_urls = [
|
||||||
url(
|
|
||||||
r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
|
|
||||||
name='document-ocr-submit-view'
|
|
||||||
),
|
|
||||||
url(
|
|
||||||
r'^document_version/(?P<pk>\d+)/submit/$',
|
|
||||||
APIDocumentVersionOCRView.as_view(),
|
|
||||||
name='document-version-ocr-submit-view'
|
|
||||||
),
|
|
||||||
url(
|
url(
|
||||||
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
|
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
|
||||||
name='document-page-content-view'
|
name='document-page-content-view'
|
||||||
|
|||||||
@@ -6,10 +6,10 @@ from django.utils.html import conditional_escape
|
|||||||
from .models import DocumentPageContent
|
from .models import DocumentPageContent
|
||||||
|
|
||||||
|
|
||||||
def get_document_ocr_content(document):
|
def get_document_content(document):
|
||||||
for page in document.pages.all():
|
for page in document.pages.all():
|
||||||
try:
|
try:
|
||||||
page_content = page.ocr_content.content
|
page_content = page.content.content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentPageContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
|
|||||||
@@ -4,137 +4,27 @@ from django.contrib import messages
|
|||||||
from django.http import HttpResponseRedirect
|
from django.http import HttpResponseRedirect
|
||||||
from django.shortcuts import get_object_or_404
|
from django.shortcuts import get_object_or_404
|
||||||
from django.urls import reverse
|
from django.urls import reverse
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _, ungettext
|
||||||
|
|
||||||
from acls.models import AccessControlList
|
|
||||||
from common.generics import (
|
from common.generics import (
|
||||||
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
|
FormView, MultipleObjectConfirmActionView, SingleObjectDetailView,
|
||||||
SingleObjectEditView, SingleObjectListView
|
SingleObjectDownloadView, SingleObjectListView
|
||||||
)
|
)
|
||||||
from common.mixins import MultipleInstanceActionMixin
|
from documents.models import Document
|
||||||
from documents.models import Document, DocumentType
|
|
||||||
|
|
||||||
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
||||||
from .models import DocumentVersionOCRError
|
from .models import DocumentVersionParseError
|
||||||
from .permissions import (
|
from .permissions import permission_content_view, permission_parse_document
|
||||||
permission_ocr_content_view, permission_ocr_document,
|
from .utils import get_document_content
|
||||||
permission_document_type_ocr_setup
|
|
||||||
)
|
|
||||||
from .utils import get_document_ocr_content
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentAllSubmitView(ConfirmView):
|
class DocumentContentView(SingleObjectDetailView):
|
||||||
extra_context = {'title': _('Submit all documents for OCR?')}
|
|
||||||
|
|
||||||
def get_post_action_redirect(self):
|
|
||||||
return reverse('common:tools_list')
|
|
||||||
|
|
||||||
def view_action(self):
|
|
||||||
count = 0
|
|
||||||
for document in Document.objects.all():
|
|
||||||
document.submit_for_ocr()
|
|
||||||
count += 1
|
|
||||||
|
|
||||||
messages.success(
|
|
||||||
self.request, _('%d documents added to the OCR queue.') % count
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentSubmitView(ConfirmView):
|
|
||||||
def get_extra_context(self):
|
|
||||||
return {
|
|
||||||
'object': self.get_object(),
|
|
||||||
'title': _('Submit "%s" to the OCR queue?') % self.get_object()
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_object(self):
|
|
||||||
return Document.objects.get(pk=self.kwargs['pk'])
|
|
||||||
|
|
||||||
def object_action(self, instance):
|
|
||||||
AccessControlList.objects.check_access(
|
|
||||||
permissions=permission_ocr_document, user=self.request.user,
|
|
||||||
obj=instance
|
|
||||||
)
|
|
||||||
|
|
||||||
instance.submit_for_ocr()
|
|
||||||
|
|
||||||
def view_action(self):
|
|
||||||
instance = self.get_object()
|
|
||||||
|
|
||||||
self.object_action(instance=instance)
|
|
||||||
|
|
||||||
messages.success(
|
|
||||||
self.request,
|
|
||||||
_('Document: %(document)s was added to the OCR queue.') % {
|
|
||||||
'document': instance
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
|
|
||||||
model = Document
|
|
||||||
success_message = '%(count)d document submitted to the OCR queue.'
|
|
||||||
success_message_plural = '%(count)d documents submitted to the OCR queue.'
|
|
||||||
|
|
||||||
def get_extra_context(self):
|
|
||||||
# Override the base class method
|
|
||||||
return {
|
|
||||||
'title': _('Submit the selected documents to the OCR queue?')
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentTypeSubmitView(FormView):
|
|
||||||
form_class = DocumentTypeSelectForm
|
|
||||||
extra_context = {
|
|
||||||
'title': _('Submit all documents of a type for OCR')
|
|
||||||
}
|
|
||||||
|
|
||||||
def get_post_action_redirect(self):
|
|
||||||
return reverse('common:tools_list')
|
|
||||||
|
|
||||||
def form_valid(self, form):
|
|
||||||
count = 0
|
|
||||||
for document in form.cleaned_data['document_type'].documents.all():
|
|
||||||
document.submit_for_ocr()
|
|
||||||
count += 1
|
|
||||||
|
|
||||||
messages.success(
|
|
||||||
self.request, _(
|
|
||||||
'%(count)d documents of type "%(document_type)s" added to the '
|
|
||||||
'OCR queue.'
|
|
||||||
) % {
|
|
||||||
'count': count,
|
|
||||||
'document_type': form.cleaned_data['document_type']
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return HttpResponseRedirect(self.get_success_url())
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentTypeSettingsEditView(SingleObjectEditView):
|
|
||||||
fields = ('auto_ocr',)
|
|
||||||
view_permission = permission_document_type_ocr_setup
|
|
||||||
|
|
||||||
def get_object(self, queryset=None):
|
|
||||||
return get_object_or_404(
|
|
||||||
DocumentType, pk=self.kwargs['pk']
|
|
||||||
).ocr_settings
|
|
||||||
|
|
||||||
def get_extra_context(self):
|
|
||||||
return {
|
|
||||||
'title': _(
|
|
||||||
'Edit OCR settings for document type: %s'
|
|
||||||
) % self.get_object().document_type
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
class DocumentOCRContent(SingleObjectDetailView):
|
|
||||||
form_class = DocumentContentForm
|
form_class = DocumentContentForm
|
||||||
model = Document
|
model = Document
|
||||||
object_permission = permission_ocr_content_view
|
object_permission = permission_content_view
|
||||||
|
|
||||||
def dispatch(self, request, *args, **kwargs):
|
def dispatch(self, request, *args, **kwargs):
|
||||||
result = super(DocumentOCRContent, self).dispatch(
|
result = super(DocumentContentView, self).dispatch(
|
||||||
request, *args, **kwargs
|
request, *args, **kwargs
|
||||||
)
|
)
|
||||||
self.get_object().add_as_recent_document_for_user(request.user)
|
self.get_object().add_as_recent_document_for_user(request.user)
|
||||||
@@ -145,23 +35,25 @@ class DocumentOCRContent(SingleObjectDetailView):
|
|||||||
'document': self.get_object(),
|
'document': self.get_object(),
|
||||||
'hide_labels': True,
|
'hide_labels': True,
|
||||||
'object': self.get_object(),
|
'object': self.get_object(),
|
||||||
'title': _('OCR result for document: %s') % self.get_object(),
|
'title': _('Content for document: %s') % self.get_object(),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class EntryListView(SingleObjectListView):
|
class DocumentContentDownloadView(SingleObjectDownloadView):
|
||||||
extra_context = {
|
model = Document
|
||||||
'hide_object': True,
|
object_permission = permission_content_view
|
||||||
'title': _('OCR errors'),
|
|
||||||
}
|
|
||||||
view_permission = permission_ocr_document
|
|
||||||
|
|
||||||
def get_object_list(self):
|
def get_file(self):
|
||||||
return DocumentVersionOCRError.objects.all()
|
file_object = DocumentContentDownloadView.TextIteratorIO(
|
||||||
|
iterator=get_document_content(document=self.get_object())
|
||||||
|
)
|
||||||
|
return DocumentContentDownloadView.VirtualFile(
|
||||||
|
file=file_object, name='{}-content'.format(self.get_object())
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class DocumentOCRErrorsListView(SingleObjectListView):
|
class DocumentParsingErrorsListView(SingleObjectListView):
|
||||||
view_permission = permission_ocr_document
|
view_permission = permission_content_view
|
||||||
|
|
||||||
def get_document(self):
|
def get_document(self):
|
||||||
return get_object_or_404(Document, pk=self.kwargs['pk'])
|
return get_object_or_404(Document, pk=self.kwargs['pk'])
|
||||||
@@ -170,21 +62,93 @@ class DocumentOCRErrorsListView(SingleObjectListView):
|
|||||||
return {
|
return {
|
||||||
'hide_object': True,
|
'hide_object': True,
|
||||||
'object': self.get_document(),
|
'object': self.get_document(),
|
||||||
'title': _('OCR errors for document: %s') % self.get_document(),
|
'title': _(
|
||||||
|
'Parsing errors for document: %s'
|
||||||
|
) % self.get_document(),
|
||||||
}
|
}
|
||||||
|
|
||||||
def get_object_list(self):
|
def get_object_list(self):
|
||||||
return self.get_document().latest_version.ocr_errors.all()
|
return self.get_document().latest_version.parsing_errors.all()
|
||||||
|
|
||||||
|
|
||||||
class DocumentOCRDownloadView(SingleObjectDownloadView):
|
class DocumentSubmitView(MultipleObjectConfirmActionView):
|
||||||
model = Document
|
model = Document
|
||||||
object_permission = permission_ocr_content_view
|
object_permission = permission_parse_document
|
||||||
|
success_message = _(
|
||||||
|
'%(count)d document added to the parsing queue'
|
||||||
|
)
|
||||||
|
success_message_plural = _(
|
||||||
|
'%(count)d documents added to the parsing queue'
|
||||||
|
)
|
||||||
|
|
||||||
def get_file(self):
|
def get_extra_context(self):
|
||||||
file_object = DocumentOCRDownloadView.TextIteratorIO(
|
queryset = self.get_queryset()
|
||||||
iterator=get_document_ocr_content(document=self.get_object())
|
|
||||||
)
|
result = {
|
||||||
return DocumentOCRDownloadView.VirtualFile(
|
'title': ungettext(
|
||||||
file=file_object, name='{}-OCR'.format(self.get_object())
|
singular='Submit %(count)d document to the parsing queue?',
|
||||||
|
plural='Submit %(count)d documents to the parsing queue',
|
||||||
|
number=queryset.count()
|
||||||
|
) % {
|
||||||
|
'count': queryset.count(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if queryset.count() == 1:
|
||||||
|
result.update(
|
||||||
|
{
|
||||||
|
'object': queryset.first(),
|
||||||
|
'title': _(
|
||||||
|
'Submit document "%s" to the parsing queue'
|
||||||
|
) % queryset.first()
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
def object_action(self, instance, form=None):
|
||||||
|
instance.submit_for_parsing()
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentTypeSubmitView(FormView):
    """
    Form view that submits every document of a selected document type
    for parsing and reports how many documents were queued.
    """
    extra_context = {
        'title': _('Submit all documents of a type for parsing')
    }
    form_class = DocumentTypeSelectForm

    def form_valid(self, form):
        """
        Submit each document of the chosen type for parsing, flash a
        success message with the total, and redirect to the success URL.
        """
        document_type = form.cleaned_data['document_type']

        submitted = 0
        for document in document_type.documents.all():
            document.submit_for_parsing()
            submitted += 1

        success_text = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'parsing queue.'
        ) % {
            'count': submitted,
            'document_type': document_type
        }
        messages.success(self.request, success_text)

        return HttpResponseRedirect(self.get_success_url())

    def get_form_extra_kwargs(self):
        """Pass the requesting user to the form as the ``user`` kwarg."""
        return {'user': self.request.user}

    def get_post_action_redirect(self):
        """Return the URL of the tools list view."""
        return reverse('common:tools_list')
|
||||||
|
|
||||||
|
|
||||||
|
class ParseErrorListView(SingleObjectListView):
    """
    List view showing all DocumentVersionParseError objects.
    """
    view_permission = permission_content_view
    extra_context = {
        'title': _('Parsing errors'),
        'hide_object': True,
    }

    def get_object_list(self):
        """Return the unfiltered DocumentVersionParseError queryset."""
        error_queryset = DocumentVersionParseError.objects.all()
        return error_queryset
|
||||||
|
|||||||
21
mayan/apps/documents/migrations/0041_auto_20170823_1855.py
Normal file
21
mayan/apps/documents/migrations/0041_auto_20170823_1855.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.10.7 on 2017-08-23 18:55
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import uuid
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """
    Alter the ``uuid`` field of the ``document`` model to a non-editable
    UUIDField whose default is ``uuid.uuid4``.
    """
    # Must be applied after the previous migration of the documents app
    dependencies = [('documents', '0040_auto_20170725_1111')]

    operations = [
        migrations.AlterField(
            model_name='document', name='uuid',
            field=models.UUIDField(editable=False, default=uuid.uuid4),
        ),
    ]
|
||||||
Reference in New Issue
Block a user