Finish the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-24 03:54:07 -04:00
parent e9591c92f9
commit a7eaf6b368
25 changed files with 423 additions and 639 deletions

View File

@@ -4,137 +4,27 @@ from django.contrib import messages
from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404
from django.urls import reverse
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext_lazy as _, ungettext
from acls.models import AccessControlList
from common.generics import (
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
SingleObjectEditView, SingleObjectListView
FormView, MultipleObjectConfirmActionView, SingleObjectDetailView,
SingleObjectDownloadView, SingleObjectListView
)
from common.mixins import MultipleInstanceActionMixin
from documents.models import Document, DocumentType
from documents.models import Document
from .forms import DocumentContentForm, DocumentTypeSelectForm
from .models import DocumentVersionOCRError
from .permissions import (
permission_ocr_content_view, permission_ocr_document,
permission_document_type_ocr_setup
)
from .utils import get_document_ocr_content
from .models import DocumentVersionParseError
from .permissions import permission_content_view, permission_parse_document
from .utils import get_document_content
class DocumentAllSubmitView(ConfirmView):
extra_context = {'title': _('Submit all documents for OCR?')}
def get_post_action_redirect(self):
return reverse('common:tools_list')
def view_action(self):
count = 0
for document in Document.objects.all():
document.submit_for_ocr()
count += 1
messages.success(
self.request, _('%d documents added to the OCR queue.') % count
)
class DocumentSubmitView(ConfirmView):
def get_extra_context(self):
return {
'object': self.get_object(),
'title': _('Submit "%s" to the OCR queue?') % self.get_object()
}
def get_object(self):
return Document.objects.get(pk=self.kwargs['pk'])
def object_action(self, instance):
AccessControlList.objects.check_access(
permissions=permission_ocr_document, user=self.request.user,
obj=instance
)
instance.submit_for_ocr()
def view_action(self):
instance = self.get_object()
self.object_action(instance=instance)
messages.success(
self.request,
_('Document: %(document)s was added to the OCR queue.') % {
'document': instance
}
)
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
model = Document
success_message = '%(count)d document submitted to the OCR queue.'
success_message_plural = '%(count)d documents submitted to the OCR queue.'
def get_extra_context(self):
# Override the base class method
return {
'title': _('Submit the selected documents to the OCR queue?')
}
class DocumentTypeSubmitView(FormView):
form_class = DocumentTypeSelectForm
extra_context = {
'title': _('Submit all documents of a type for OCR')
}
def get_post_action_redirect(self):
return reverse('common:tools_list')
def form_valid(self, form):
count = 0
for document in form.cleaned_data['document_type'].documents.all():
document.submit_for_ocr()
count += 1
messages.success(
self.request, _(
'%(count)d documents of type "%(document_type)s" added to the '
'OCR queue.'
) % {
'count': count,
'document_type': form.cleaned_data['document_type']
}
)
return HttpResponseRedirect(self.get_success_url())
class DocumentTypeSettingsEditView(SingleObjectEditView):
fields = ('auto_ocr',)
view_permission = permission_document_type_ocr_setup
def get_object(self, queryset=None):
return get_object_or_404(
DocumentType, pk=self.kwargs['pk']
).ocr_settings
def get_extra_context(self):
return {
'title': _(
'Edit OCR settings for document type: %s'
) % self.get_object().document_type
}
class DocumentOCRContent(SingleObjectDetailView):
class DocumentContentView(SingleObjectDetailView):
form_class = DocumentContentForm
model = Document
object_permission = permission_ocr_content_view
object_permission = permission_content_view
def dispatch(self, request, *args, **kwargs):
result = super(DocumentOCRContent, self).dispatch(
result = super(DocumentContentView, self).dispatch(
request, *args, **kwargs
)
self.get_object().add_as_recent_document_for_user(request.user)
@@ -145,23 +35,25 @@ class DocumentOCRContent(SingleObjectDetailView):
'document': self.get_object(),
'hide_labels': True,
'object': self.get_object(),
'title': _('OCR result for document: %s') % self.get_object(),
'title': _('Content for document: %s') % self.get_object(),
}
class EntryListView(SingleObjectListView):
extra_context = {
'hide_object': True,
'title': _('OCR errors'),
}
view_permission = permission_ocr_document
class DocumentContentDownloadView(SingleObjectDownloadView):
model = Document
object_permission = permission_content_view
def get_object_list(self):
return DocumentVersionOCRError.objects.all()
def get_file(self):
file_object = DocumentContentDownloadView.TextIteratorIO(
iterator=get_document_content(document=self.get_object())
)
return DocumentContentDownloadView.VirtualFile(
file=file_object, name='{}-content'.format(self.get_object())
)
class DocumentOCRErrorsListView(SingleObjectListView):
view_permission = permission_ocr_document
class DocumentParsingErrorsListView(SingleObjectListView):
view_permission = permission_content_view
def get_document(self):
return get_object_or_404(Document, pk=self.kwargs['pk'])
@@ -170,21 +62,93 @@ class DocumentOCRErrorsListView(SingleObjectListView):
return {
'hide_object': True,
'object': self.get_document(),
'title': _('OCR errors for document: %s') % self.get_document(),
'title': _(
'Parsing errors for document: %s'
) % self.get_document(),
}
def get_object_list(self):
return self.get_document().latest_version.ocr_errors.all()
return self.get_document().latest_version.parsing_errors.all()
class DocumentOCRDownloadView(SingleObjectDownloadView):
class DocumentSubmitView(MultipleObjectConfirmActionView):
model = Document
object_permission = permission_ocr_content_view
object_permission = permission_parse_document
success_message = _(
'%(count)d document added to the parsing queue'
)
success_message_plural = _(
'%(count)d documents added to the parsing queue'
)
def get_file(self):
file_object = DocumentOCRDownloadView.TextIteratorIO(
iterator=get_document_ocr_content(document=self.get_object())
)
return DocumentOCRDownloadView.VirtualFile(
file=file_object, name='{}-OCR'.format(self.get_object())
def get_extra_context(self):
queryset = self.get_queryset()
result = {
'title': ungettext(
singular='Submit %(count)d document to the parsing queue?',
plural='Submit %(count)d documents to the parsing queue',
number=queryset.count()
) % {
'count': queryset.count(),
}
}
if queryset.count() == 1:
result.update(
{
'object': queryset.first(),
'title': _(
'Submit document "%s" to the parsing queue'
) % queryset.first()
}
)
return result
def object_action(self, instance, form=None):
instance.submit_for_parsing()
class DocumentTypeSubmitView(FormView):
form_class = DocumentTypeSelectForm
extra_context = {
'title': _('Submit all documents of a type for parsing')
}
def get_form_extra_kwargs(self):
return {
'user': self.request.user
}
def get_post_action_redirect(self):
return reverse('common:tools_list')
def form_valid(self, form):
count = 0
for document in form.cleaned_data['document_type'].documents.all():
document.submit_for_parsing()
count += 1
messages.success(
self.request, _(
'%(count)d documents of type "%(document_type)s" added to the '
'parsing queue.'
) % {
'count': count,
'document_type': form.cleaned_data['document_type']
}
)
return HttpResponseRedirect(self.get_success_url())
class ParseErrorListView(SingleObjectListView):
extra_context = {
'hide_object': True,
'title': _('Parsing errors'),
}
view_permission = permission_content_view
def get_object_list(self):
return DocumentVersionParseError.objects.all()