Finish the document parsing app.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
||||
from django.contrib import admin
|
||||
|
||||
from .models import (
|
||||
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
|
||||
DocumentPageContent, DocumentVersionParseError
|
||||
)
|
||||
|
||||
|
||||
@@ -12,12 +12,7 @@ class DocumentPageContentAdmin(admin.ModelAdmin):
|
||||
list_display = ('document_page',)
|
||||
|
||||
|
||||
@admin.register(DocumentTypeSettings)
|
||||
class DocumentTypeSettingsAdmin(admin.ModelAdmin):
|
||||
list_display = ('document_type', 'auto_ocr')
|
||||
|
||||
|
||||
@admin.register(DocumentVersionOCRError)
|
||||
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
|
||||
@admin.register(DocumentVersionParseError)
|
||||
class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
|
||||
list_display = ('document_version', 'datetime_submitted')
|
||||
readonly_fields = ('document_version', 'datetime_submitted', 'result')
|
||||
|
||||
@@ -1,75 +1,19 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from rest_framework import generics, status
|
||||
from rest_framework import generics
|
||||
from rest_framework.response import Response
|
||||
|
||||
from documents.models import Document, DocumentPage, DocumentVersion
|
||||
from documents.models import DocumentPage
|
||||
from rest_api.permissions import MayanPermission
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .permissions import permission_ocr_content_view, permission_ocr_document
|
||||
from .permissions import permission_content_view
|
||||
from .serializers import DocumentPageContentSerializer
|
||||
|
||||
|
||||
class APIDocumentOCRView(generics.GenericAPIView):
|
||||
mayan_object_permissions = {
|
||||
'POST': (permission_ocr_document,)
|
||||
}
|
||||
permission_classes = (MayanPermission,)
|
||||
queryset = Document.objects.all()
|
||||
|
||||
def get_serializer_class(self):
|
||||
return None
|
||||
|
||||
def post(self, request, *args, **kwargs):
|
||||
"""
|
||||
Submit a document for OCR.
|
||||
---
|
||||
omit_serializer: true
|
||||
parameters:
|
||||
- name: pk
|
||||
paramType: path
|
||||
type: number
|
||||
responseMessages:
|
||||
- code: 202
|
||||
message: Accepted
|
||||
"""
|
||||
|
||||
self.get_object().submit_for_ocr()
|
||||
return Response(status=status.HTTP_202_ACCEPTED)
|
||||
|
||||
|
||||
class APIDocumentVersionOCRView(generics.GenericAPIView):
|
||||
mayan_object_permissions = {
|
||||
'POST': (permission_ocr_document,)
|
||||
}
|
||||
permission_classes = (MayanPermission,)
|
||||
queryset = DocumentVersion.objects.all()
|
||||
|
||||
def get_serializer_class(self):
|
||||
return None
|
||||
|
||||
def post(self, request, *args, **kwargs):
|
||||
"""
|
||||
Submit a document version for OCR.
|
||||
---
|
||||
omit_serializer: true
|
||||
parameters:
|
||||
- name: pk
|
||||
paramType: path
|
||||
type: number
|
||||
responseMessages:
|
||||
- code: 202
|
||||
message: Accepted
|
||||
"""
|
||||
|
||||
self.get_object().submit_for_ocr()
|
||||
return Response(status=status.HTTP_202_ACCEPTED)
|
||||
|
||||
|
||||
class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||
"""
|
||||
Returns the OCR content of the selected document page.
|
||||
Returns the content of the selected document page.
|
||||
---
|
||||
GET:
|
||||
parameters:
|
||||
@@ -79,7 +23,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||
"""
|
||||
|
||||
mayan_object_permissions = {
|
||||
'GET': (permission_ocr_content_view,),
|
||||
'GET': (permission_content_view,),
|
||||
}
|
||||
permission_classes = (MayanPermission,)
|
||||
serializer_class = DocumentPageContentSerializer
|
||||
@@ -89,9 +33,9 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||
instance = self.get_object()
|
||||
|
||||
try:
|
||||
ocr_content = instance.ocr_content
|
||||
content = instance.content
|
||||
except DocumentPageContent.DoesNotExist:
|
||||
ocr_content = DocumentPageContent.objects.none()
|
||||
content = DocumentPageContent.objects.none()
|
||||
|
||||
serializer = self.get_serializer(ocr_content)
|
||||
serializer = self.get_serializer(content)
|
||||
return Response(serializer.data)
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
|
||||
from kombu import Exchange, Queue
|
||||
|
||||
from django.apps import apps
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils.timezone import now
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from acls import ModelPermission
|
||||
@@ -21,16 +22,38 @@ from mayan.celery import app
|
||||
from navigation import SourceColumn
|
||||
from rest_api.classes import APIEndPoint
|
||||
|
||||
from .events import event_parsing_document_version_submit
|
||||
from .handlers import handler_parse_document_version
|
||||
from .links import (
|
||||
link_document_content, link_entry_list, link_document_content_errors_list,
|
||||
link_document_content_download
|
||||
link_document_content, link_document_content_download,
|
||||
link_document_parsing_errors_list, link_document_submit_multiple,
|
||||
link_document_submit, link_document_type_submit, link_error_list
|
||||
)
|
||||
from .permissions import permission_content_view
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def document_parsing_submit(self):
|
||||
latest_version = self.latest_version
|
||||
# Don't error out if document has no version
|
||||
if latest_version:
|
||||
latest_version.submit_for_parsing()
|
||||
|
||||
|
||||
def document_version_parsing_submit(self):
|
||||
from .tasks import task_parse_document_version
|
||||
|
||||
event_parsing_document_version_submit.commit(
|
||||
action_object=self.document, target=self
|
||||
)
|
||||
|
||||
task_parse_document_version.apply_async(
|
||||
eta=now() + timedelta(seconds=settings_db_sync_task_delay.value),
|
||||
kwargs={'document_version_pk': self.pk},
|
||||
)
|
||||
|
||||
|
||||
class DocumentParsingApp(MayanAppConfig):
|
||||
has_tests = True
|
||||
name = 'document_parsing'
|
||||
@@ -45,16 +68,17 @@ class DocumentParsingApp(MayanAppConfig):
|
||||
app_label='documents', model_name='Document'
|
||||
)
|
||||
|
||||
DocumentType = apps.get_model(
|
||||
app_label='documents', model_name='DocumentType'
|
||||
)
|
||||
|
||||
DocumentVersion = apps.get_model(
|
||||
app_label='documents', model_name='DocumentVersion'
|
||||
)
|
||||
|
||||
DocumentVersionParseError = self.get_model('DocumentVersionParseError')
|
||||
|
||||
Document.add_to_class('submit_for_parsing', document_parsing_submit)
|
||||
DocumentVersion.add_to_class(
|
||||
'submit_for_parsing', document_version_parsing_submit
|
||||
)
|
||||
|
||||
ModelPermission.register(
|
||||
model=Document, permissions=(permission_content_view,)
|
||||
)
|
||||
@@ -72,6 +96,18 @@ class DocumentParsingApp(MayanAppConfig):
|
||||
attribute='result'
|
||||
)
|
||||
|
||||
app.conf.CELERY_QUEUES.append(
|
||||
Queue('parsing', Exchange('parsing'), routing_key='parsing'),
|
||||
)
|
||||
|
||||
app.conf.CELERY_ROUTES.update(
|
||||
{
|
||||
'document_parsing.tasks.task_parse_document_version': {
|
||||
'queue': 'parsing'
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
document_search.add_model_field(
|
||||
field='versions__pages__content__content', label=_('Content')
|
||||
)
|
||||
@@ -89,32 +125,20 @@ class DocumentParsingApp(MayanAppConfig):
|
||||
menu_object.bind_links(
|
||||
links=(link_document_submit,), sources=(Document,)
|
||||
)
|
||||
menu_object.bind_links(
|
||||
links=(link_document_type_ocr_settings,), sources=(DocumentType,)
|
||||
)
|
||||
menu_secondary.bind_links(
|
||||
links=(
|
||||
link_document_content, link_document_ocr_erros_list,
|
||||
link_document_ocr_download
|
||||
link_document_content, link_document_parsing_errors_list,
|
||||
link_document_content_download
|
||||
),
|
||||
sources=(
|
||||
'document_parsing:document_content',
|
||||
'document_parsing:document_ocr_error_list',
|
||||
'document_parsing:document_ocr_download',
|
||||
)
|
||||
)
|
||||
menu_secondary.bind_links(
|
||||
links=(link_entry_list,),
|
||||
sources=(
|
||||
'document_parsing:entry_list',
|
||||
'document_parsing:entry_delete_multiple',
|
||||
'document_parsing:entry_re_queue_multiple',
|
||||
DocumentVersionParseError
|
||||
'document_parsing:document_content_download',
|
||||
'document_parsing:document_parsing_error_list',
|
||||
)
|
||||
)
|
||||
menu_tools.bind_links(
|
||||
links=(
|
||||
link_entry_list
|
||||
link_document_type_submit, link_error_list,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
14
mayan/apps/document_parsing/events.py
Normal file
14
mayan/apps/document_parsing/events.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from events.classes import Event
|
||||
|
||||
event_parsing_document_version_submit = Event(
|
||||
name='parsing_document_version_submit',
|
||||
label=_('Document version submitted for parsing')
|
||||
)
|
||||
event_parsing_document_version_finish = Event(
|
||||
name='parsing_document_version_finish',
|
||||
label=_('Document version parsing finished')
|
||||
)
|
||||
@@ -1,13 +1,6 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
class OCRError(Exception):
|
||||
"""
|
||||
Raised by the OCR backend
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ParserError(Exception):
|
||||
"""
|
||||
Base exception for file parsers
|
||||
|
||||
@@ -6,10 +6,12 @@ from django.utils.html import conditional_escape
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.utils.translation import ugettext_lazy as _, ugettext
|
||||
|
||||
from acls.models import AccessControlList
|
||||
from common.widgets import TextAreaDiv
|
||||
from documents.models import DocumentType
|
||||
|
||||
from .models import DocumentPageContent, DocumentPageOCRContent
|
||||
from .models import DocumentPageContent
|
||||
from .permissions import permission_parse_document
|
||||
|
||||
|
||||
class DocumentContentForm(forms.Form):
|
||||
@@ -29,7 +31,7 @@ class DocumentContentForm(forms.Form):
|
||||
|
||||
for page in document_pages:
|
||||
try:
|
||||
page_content = page.ocr_content.content
|
||||
page_content = page.content.content
|
||||
except DocumentPageContent.DoesNotExist:
|
||||
pass
|
||||
else:
|
||||
@@ -55,50 +57,16 @@ class DocumentContentForm(forms.Form):
|
||||
)
|
||||
|
||||
|
||||
class DocumentOCRContentForm(forms.Form):
|
||||
"""
|
||||
Form that concatenates all of a document pages' text content into a
|
||||
single textarea widget
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.document = kwargs.pop('instance', None)
|
||||
super(DocumentContentForm, self).__init__(*args, **kwargs)
|
||||
content = []
|
||||
self.fields['contents'].initial = ''
|
||||
try:
|
||||
document_pages = self.document.pages.all()
|
||||
except AttributeError:
|
||||
document_pages = []
|
||||
|
||||
for page in document_pages:
|
||||
try:
|
||||
page_content = page.ocr_content.content
|
||||
except DocumentPageOCRContent.DoesNotExist:
|
||||
pass
|
||||
else:
|
||||
content.append(conditional_escape(force_text(page_content)))
|
||||
content.append(
|
||||
'\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
|
||||
ugettext(
|
||||
'Page %(page_number)d'
|
||||
) % {'page_number': page.page_number}
|
||||
)
|
||||
)
|
||||
|
||||
self.fields['contents'].initial = mark_safe(''.join(content))
|
||||
|
||||
contents = forms.CharField(
|
||||
label=_('Contents'),
|
||||
widget=TextAreaDiv(
|
||||
attrs={
|
||||
'class': 'text_area_div full-height',
|
||||
'data-height-difference': 360
|
||||
}
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
class DocumentTypeSelectForm(forms.Form):
|
||||
document_type = forms.ModelChoiceField(
|
||||
queryset=DocumentType.objects.all(), label=('Document type')
|
||||
queryset=DocumentType.objects.none(), label=('Document type')
|
||||
)
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
user = kwargs.pop('user')
|
||||
super(DocumentTypeSelectForm, self).__init__(*args, **kwargs)
|
||||
queryset = AccessControlList.objects.filter_by_access(
|
||||
permission=permission_parse_document,
|
||||
queryset=DocumentType.objects.all(), user=user,
|
||||
)
|
||||
self.fields['document_type'].queryset = queryset
|
||||
|
||||
@@ -2,14 +2,8 @@ from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.apps import apps
|
||||
|
||||
from .settings import setting_auto_ocr
|
||||
from .parsers import Parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def handler_parse_document_version(sender, instance, **kwargs):
|
||||
if kwargs['created']:
|
||||
Parser.parse_document_version(document_version=instance)
|
||||
instance.submit_for_parsing()
|
||||
|
||||
@@ -4,24 +4,36 @@ from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from navigation import Link
|
||||
|
||||
from .permissions import permission_content_view
|
||||
from .permissions import permission_content_view, permission_parse_document
|
||||
|
||||
link_document_content = Link(
|
||||
args='resolved_object.id', icon='fa fa-font',
|
||||
permissions=(permission_content_view,), text=_('Content'),
|
||||
view='document_parsing:document_content',
|
||||
)
|
||||
link_entry_list = Link(
|
||||
icon='fa fa-file-text-o', permissions=(permission_ocr_document,),
|
||||
text=_('Parsing errors'), view='document_parsing:entry_list'
|
||||
)
|
||||
link_document_content_errors_list = Link(
|
||||
link_document_parsing_errors_list = Link(
|
||||
args='resolved_object.id', icon='fa fa-file-text-o',
|
||||
permissions=(permission_ocr_content_view,), text=_('Parsing errors'),
|
||||
view='document_parsing:document_page_parsing_error_list'
|
||||
permissions=(permission_content_view,), text=_('Parsing errors'),
|
||||
view='document_parsing:document_parsing_error_list'
|
||||
)
|
||||
link_document_content_download = Link(
|
||||
args='resolved_object.id', icon='fa fa-file-text-o',
|
||||
permissions=(permission_ocr_content_view,), text=_('Download content'),
|
||||
permissions=(permission_content_view,), text=_('Download content'),
|
||||
view='document_parsing:document_content_download'
|
||||
)
|
||||
link_document_submit_multiple = Link(
|
||||
text=_('Submit for parsing'),
|
||||
view='document_parsing:document_submit_multiple'
|
||||
)
|
||||
link_document_submit = Link(
|
||||
args='resolved_object.id', permissions=(permission_parse_document,),
|
||||
text=_('Submit for parsing'), view='document_parsing:document_submit'
|
||||
)
|
||||
link_document_type_submit = Link(
|
||||
icon='fa fa-crosshairs', text=_('Parse documents per type'),
|
||||
view='document_parsing:document_type_submit'
|
||||
)
|
||||
link_error_list = Link(
|
||||
icon='fa fa-file-text-o', permissions=(permission_content_view,),
|
||||
text=_('Parsing errors'), view='document_parsing:error_list'
|
||||
)
|
||||
|
||||
@@ -1,14 +1,50 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from django.apps import apps
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
from django.utils.timezone import now
|
||||
|
||||
from .events import event_parsing_document_version_finish
|
||||
from .parsers import Parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentPageContentManager(models.Manager):
|
||||
pass
|
||||
def process_document_version(self, document_version):
|
||||
logger.info(
|
||||
'Starting parsing for document version: %s', document_version
|
||||
)
|
||||
logger.debug('document version: %d', document_version.pk)
|
||||
|
||||
try:
|
||||
Parser.parse_document_version(document_version=document_version)
|
||||
except Exception as exception:
|
||||
logger.exception(
|
||||
'Parsing error for document version: %d; %s',
|
||||
document_version.pk, exception,
|
||||
)
|
||||
|
||||
if settings.DEBUG:
|
||||
result = []
|
||||
type, value, tb = sys.exc_info()
|
||||
result.append('%s: %s' % (type.__name__, value))
|
||||
result.extend(traceback.format_tb(tb))
|
||||
document_version.parsing_errors.create(
|
||||
result='\n'.join(result)
|
||||
)
|
||||
else:
|
||||
document_version.parsing_errors.create(result=exception)
|
||||
else:
|
||||
logger.info(
|
||||
'Parsing complete for document version: %s', document_version
|
||||
)
|
||||
document_version.parsing_errors.all().delete()
|
||||
|
||||
event_parsing_document_version_finish.commit(
|
||||
action_object=document_version.document,
|
||||
target=document_version
|
||||
)
|
||||
|
||||
44
mayan/apps/document_parsing/migrations/0001_initial.py
Normal file
44
mayan/apps/document_parsing/migrations/0001_initial.py
Normal file
@@ -0,0 +1,44 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.7 on 2017-08-23 18:55
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
initial = True
|
||||
|
||||
dependencies = [
|
||||
('documents', '0041_auto_20170823_1855'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='DocumentPageContent',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('content', models.TextField(blank=True, verbose_name='Content')),
|
||||
('document_page', models.OneToOneField(on_delete=django.db.models.deletion.CASCADE, related_name='content', to='documents.DocumentPage', verbose_name='Document page')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Document page content',
|
||||
'verbose_name_plural': 'Document pages contents',
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='DocumentVersionParseError',
|
||||
fields=[
|
||||
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
|
||||
('datetime_submitted', models.DateTimeField(auto_now_add=True, db_index=True, verbose_name='Date time submitted')),
|
||||
('result', models.TextField(blank=True, null=True, verbose_name='Result')),
|
||||
('document_version', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='parse_errors', to='documents.DocumentVersion', verbose_name='Document version')),
|
||||
],
|
||||
options={
|
||||
'ordering': ('datetime_submitted',),
|
||||
'verbose_name': 'Document version parse error',
|
||||
'verbose_name_plural': 'Document version parse errors',
|
||||
},
|
||||
),
|
||||
]
|
||||
0
mayan/apps/document_parsing/migrations/__init__.py
Normal file
0
mayan/apps/document_parsing/migrations/__init__.py
Normal file
@@ -4,7 +4,7 @@ from django.db import models
|
||||
from django.utils.encoding import force_text, python_2_unicode_compatible
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
||||
from documents.models import DocumentPage, DocumentVersion
|
||||
|
||||
from .managers import DocumentPageContentManager
|
||||
|
||||
@@ -30,11 +30,11 @@ class DocumentPageContent(models.Model):
|
||||
@python_2_unicode_compatible
|
||||
class DocumentVersionParseError(models.Model):
|
||||
document_version = models.ForeignKey(
|
||||
DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
|
||||
verbose_name=_('Document version')
|
||||
DocumentVersion, on_delete=models.CASCADE,
|
||||
related_name='parsing_errors', verbose_name=_('Document version')
|
||||
)
|
||||
datetime_submitted = models.DateTimeField(
|
||||
auto_add_now=True, db_index=True, verbose_name=_('Date time submitted')
|
||||
auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
|
||||
)
|
||||
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
||||
|
||||
|
||||
@@ -1,20 +1,15 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import os
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
import subprocess
|
||||
|
||||
from django.apps import apps
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import copyfile, fs_cleanup, mkstemp
|
||||
|
||||
from .exceptions import ParserError, NoMIMETypeMatch
|
||||
from .models import DocumentPageContent
|
||||
from .settings import setting_pdftotext_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -82,6 +77,10 @@ class Parser(object):
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page):
|
||||
DocumentPageContent = apps.get_model(
|
||||
app_label='document_parsing', model_name='DocumentPageContent'
|
||||
)
|
||||
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
@@ -171,32 +170,7 @@ class PopplerParser(Parser):
|
||||
return output
|
||||
|
||||
|
||||
class PDFMinerParser(Parser):
|
||||
"""
|
||||
Parser for PDF files using the PDFMiner library for Python
|
||||
"""
|
||||
|
||||
def execute(self, file_object, page_number):
|
||||
logger.debug('Parsing PDF page: %d', page_number)
|
||||
|
||||
with BytesIO() as string_buffer:
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = TextConverter(
|
||||
rsrcmgr, outfp=string_buffer, laparams=LAParams()
|
||||
)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
page = PDFPage.get_pages(
|
||||
file_object, maxpages=1, pagenos=(page_number - 1,)
|
||||
)
|
||||
interpreter.process_page(page.next())
|
||||
device.close()
|
||||
|
||||
logger.debug('Finished parsing PDF: %d', page_number)
|
||||
|
||||
return string_buffer.getvalue()
|
||||
|
||||
|
||||
Parser.register(
|
||||
mimetypes=('application/pdf',),
|
||||
parser_classes=(PopplerParser, PDFMinerParser)
|
||||
parser_classes=(PopplerParser,)
|
||||
)
|
||||
|
||||
@@ -9,3 +9,7 @@ namespace = PermissionNamespace('document_parsing', _('Document parsing'))
|
||||
permission_content_view = namespace.add_permission(
|
||||
name='content_view', label=_('View the content of a document')
|
||||
)
|
||||
|
||||
permission_parse_document = namespace.add_permission(
|
||||
name='parse_document', label=_('Parse the content of a document')
|
||||
)
|
||||
|
||||
@@ -4,7 +4,8 @@ from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from task_manager.classes import CeleryQueue
|
||||
|
||||
queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
|
||||
queue_ocr = CeleryQueue(name='parsing', label=_('Parsing'))
|
||||
queue_ocr.add_task_type(
|
||||
name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
|
||||
name='document_parsing.tasks.task_parse_document_version',
|
||||
label=_('Document version parsing')
|
||||
)
|
||||
|
||||
29
mayan/apps/document_parsing/tasks.py
Normal file
29
mayan/apps/document_parsing/tasks.py
Normal file
@@ -0,0 +1,29 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.apps import apps
|
||||
|
||||
from mayan.celery import app
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@app.task(ignore_result=True)
|
||||
def task_parse_document_version(document_version_pk):
|
||||
DocumentVersion = apps.get_model(
|
||||
app_label='documents', model_name='DocumentVersion'
|
||||
)
|
||||
DocumentPageContent = apps.get_model(
|
||||
app_label='document_parsing', model_name='DocumentPageContent'
|
||||
)
|
||||
|
||||
document_version = DocumentVersion.objects.get(
|
||||
pk=document_version_pk
|
||||
)
|
||||
logger.info(
|
||||
'Starting parsing for document version: %s', document_version
|
||||
)
|
||||
DocumentPageContent.objects.process_document_version(
|
||||
document_version=document_version
|
||||
)
|
||||
@@ -1,88 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.urls import reverse
|
||||
|
||||
from rest_framework import status
|
||||
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||
from rest_api.tests import BaseAPITestCase
|
||||
from user_management.tests import (
|
||||
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
|
||||
)
|
||||
|
||||
|
||||
class OCRAPITestCase(BaseAPITestCase):
|
||||
"""
|
||||
Test the OCR app API endpoints
|
||||
"""
|
||||
|
||||
def setUp(self):
|
||||
super(OCRAPITestCase, self).setUp()
|
||||
|
||||
self.admin_user = get_user_model().objects.create_superuser(
|
||||
username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
|
||||
password=TEST_ADMIN_PASSWORD
|
||||
)
|
||||
|
||||
self.client.login(
|
||||
username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
|
||||
)
|
||||
|
||||
self.document_type = DocumentType.objects.create(
|
||||
label=TEST_DOCUMENT_TYPE_LABEL
|
||||
)
|
||||
|
||||
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=file_object,
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
self.document_type.delete()
|
||||
super(OCRAPITestCase, self).tearDown()
|
||||
|
||||
def test_submit_document(self):
|
||||
response = self.client.post(
|
||||
reverse(
|
||||
'rest_api:document-ocr-submit-view',
|
||||
args=(self.document.pk,)
|
||||
)
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
||||
|
||||
content = self.document.pages.first().ocr_content.content
|
||||
|
||||
self.assertTrue('Mayan EDMS Documentation' in content)
|
||||
|
||||
def test_submit_document_version(self):
|
||||
response = self.client.post(
|
||||
reverse(
|
||||
'rest_api:document-version-ocr-submit-view',
|
||||
args=(self.document.latest_version.pk,)
|
||||
)
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)
|
||||
|
||||
content = self.document.pages.first().ocr_content.content
|
||||
|
||||
self.assertTrue('Mayan EDMS Documentation' in content)
|
||||
|
||||
def test_get_document_version_page_content(self):
|
||||
response = self.client.get(
|
||||
reverse(
|
||||
'rest_api:document-page-content-view',
|
||||
args=(self.document.latest_version.pages.first().pk,)
|
||||
),
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, status.HTTP_200_OK)
|
||||
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in json.loads(response.content)['content']
|
||||
)
|
||||
@@ -2,40 +2,38 @@ from __future__ import unicode_literals
|
||||
|
||||
from actstream.models import Action
|
||||
|
||||
from documents.tests.literals import TEST_DOCUMENT_FILENAME
|
||||
from documents.tests.test_models import GenericDocumentTestCase
|
||||
|
||||
from ..events import (
|
||||
event_ocr_document_version_submit, event_ocr_document_version_finish
|
||||
event_parsing_document_version_submit,
|
||||
event_parsing_document_version_finish
|
||||
)
|
||||
|
||||
|
||||
class OCREventsTestCase(GenericDocumentTestCase):
|
||||
class DocumentParsingEventsTestCase(GenericDocumentTestCase):
|
||||
# Ensure we use a PDF file
|
||||
test_document_filename = TEST_DOCUMENT_FILENAME
|
||||
|
||||
def test_document_version_submit_event(self):
|
||||
Action.objects.all().delete()
|
||||
self.document.submit_for_ocr()
|
||||
|
||||
self.assertEqual(
|
||||
Action.objects.first().target, self.document.latest_version
|
||||
)
|
||||
self.assertEqual(
|
||||
Action.objects.first().verb,
|
||||
event_ocr_document_version_submit.name
|
||||
)
|
||||
|
||||
def test_document_version_finish_event(self):
|
||||
Action.objects.all().delete()
|
||||
self.document.submit_for_ocr()
|
||||
from ..models import DocumentVersionOCRError, DocumentPageContent
|
||||
#print DocumentVersionOCRError.objects.all()
|
||||
print DocumentPageContent.objects.all()
|
||||
|
||||
for a in Action.objects.all():
|
||||
print a
|
||||
self.document.submit_for_parsing()
|
||||
|
||||
self.assertEqual(
|
||||
Action.objects.last().target, self.document.latest_version
|
||||
)
|
||||
self.assertEqual(
|
||||
Action.objects.last().verb,
|
||||
event_ocr_document_version_finish.name
|
||||
event_parsing_document_version_submit.name
|
||||
)
|
||||
|
||||
def test_document_version_finish_event(self):
|
||||
Action.objects.all().delete()
|
||||
self.document.submit_for_parsing()
|
||||
self.assertEqual(
|
||||
Action.objects.first().target, self.document.latest_version
|
||||
)
|
||||
self.assertEqual(
|
||||
Action.objects.first().verb,
|
||||
event_parsing_document_version_finish.name
|
||||
)
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.settings import setting_language_choices
|
||||
from documents.tests import (
|
||||
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
|
||||
class DocumentOCRTestCase(BaseTestCase):
|
||||
# PyOCR's leak descriptor in get_available_languages and image_to_string
|
||||
# Disable descriptor leak test until fixed in upstream
|
||||
_skip_file_descriptor_test = True
|
||||
|
||||
def setUp(self):
|
||||
super(DocumentOCRTestCase, self).setUp()
|
||||
|
||||
self.document_type = DocumentType.objects.create(
|
||||
label=TEST_DOCUMENT_TYPE_LABEL
|
||||
)
|
||||
|
||||
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=file_object,
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
self.document.delete()
|
||||
self.document_type.delete()
|
||||
super(DocumentOCRTestCase, self).tearDown()
|
||||
|
||||
def test_ocr_language_backends_end(self):
|
||||
content = self.document.pages.first().ocr_content.content
|
||||
self.assertTrue('Mayan EDMS Documentation' in content)
|
||||
|
||||
|
||||
class GermanOCRSupportTestCase(BaseTestCase):
|
||||
# PyOCR's leak descriptor in get_available_languages and image_to_string
|
||||
# Disable descriptor leak test until fixed in upstream
|
||||
_skip_file_descriptor_test = True
|
||||
|
||||
def setUp(self):
|
||||
super(GermanOCRSupportTestCase, self).setUp()
|
||||
|
||||
self.document_type = DocumentType.objects.create(
|
||||
label=TEST_DOCUMENT_TYPE_LABEL
|
||||
)
|
||||
|
||||
# Get corresponding language code for German from the default language
|
||||
# choices list
|
||||
language_code = [
|
||||
language for language in setting_language_choices.value if language[1] == 'German'
|
||||
][0][0]
|
||||
|
||||
self.assertEqual('deu', language_code)
|
||||
|
||||
with open(TEST_DEU_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=file_object, language=language_code
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
self.document_type.delete()
|
||||
super(GermanOCRSupportTestCase, self).tearDown()
|
||||
|
||||
def test_ocr_language_backends_end(self):
|
||||
content = self.document.pages.first().ocr_content.content
|
||||
|
||||
self.assertTrue(
|
||||
'Repository für elektronische Dokumente.' in content
|
||||
)
|
||||
self.assertTrue(
|
||||
'Es bietet einen' in content
|
||||
)
|
||||
@@ -5,12 +5,9 @@ from django.test import override_settings
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import (
|
||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
||||
)
|
||||
from documents.tests import TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL
|
||||
|
||||
from ..classes import TextExtractor
|
||||
from ..parsers import PDFMinerParser, PopplerParser
|
||||
from ..parsers import PopplerParser
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
|
||||
@@ -30,54 +27,11 @@ class ParserTestCase(BaseTestCase):
|
||||
self.document_type.delete()
|
||||
super(ParserTestCase, self).tearDown()
|
||||
|
||||
def test_pdfminer_parser(self):
|
||||
parser = PDFMinerParser()
|
||||
|
||||
parser.process_document_version(self.document.latest_version)
|
||||
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
||||
)
|
||||
|
||||
def test_poppler_parser(self):
|
||||
parser = PopplerParser()
|
||||
|
||||
parser.process_document_version(self.document.latest_version)
|
||||
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
||||
)
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
|
||||
class TextExtractorTestCase(BaseTestCase):
|
||||
def setUp(self):
|
||||
super(TextExtractorTestCase, self).setUp()
|
||||
|
||||
self.document_type = DocumentType.objects.create(
|
||||
label=TEST_DOCUMENT_TYPE_LABEL
|
||||
)
|
||||
|
||||
with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=File(file_object)
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
self.document_type.delete()
|
||||
super(TextExtractorTestCase, self).tearDown()
|
||||
|
||||
def test_text_extractor(self):
|
||||
TextExtractor.process_document_version(
|
||||
document_version=self.document.latest_version
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.document.latest_version.pages.first().ocr_content.content,
|
||||
'Sample text',
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.document.latest_version.pages.last().ocr_content.content,
|
||||
'Sample text in image form',
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().content.content
|
||||
)
|
||||
|
||||
@@ -1,26 +1,25 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.literals import TEST_DOCUMENT_FILENAME
|
||||
from documents.tests.test_views import GenericDocumentViewTestCase
|
||||
|
||||
from ..permissions import permission_ocr_content_view
|
||||
from ..utils import get_document_ocr_content
|
||||
from ..permissions import permission_content_view
|
||||
from ..utils import get_document_content
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=True)
|
||||
class OCRViewsTestCase(GenericDocumentViewTestCase):
|
||||
# PyOCR's leak descriptor in get_available_languages and image_to_string
|
||||
# Disable descriptor leak test until fixed in upstream
|
||||
class DocumentContentViewsTestCase(GenericDocumentViewTestCase):
|
||||
_skip_file_descriptor_test = True
|
||||
|
||||
# Ensure we use a PDF file
|
||||
test_document_filename = TEST_DOCUMENT_FILENAME
|
||||
|
||||
def setUp(self):
|
||||
super(OCRViewsTestCase, self).setUp()
|
||||
super(DocumentContentViewsTestCase, self).setUp()
|
||||
self.login_user()
|
||||
|
||||
def _document_content_view(self):
|
||||
return self.get(
|
||||
'ocr:document_content', args=(self.document.pk,)
|
||||
'document_parsing:document_content', args=(self.document.pk,)
|
||||
)
|
||||
|
||||
def test_document_content_view_no_permissions(self):
|
||||
@@ -29,7 +28,7 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
|
||||
self.assertEqual(response.status_code, 403)
|
||||
|
||||
def test_document_content_view_with_permission(self):
|
||||
self.grant_permission(permission=permission_ocr_content_view)
|
||||
self.grant_permission(permission=permission_content_view)
|
||||
|
||||
response = self._document_content_view()
|
||||
|
||||
@@ -37,25 +36,25 @@ class OCRViewsTestCase(GenericDocumentViewTestCase):
|
||||
response, 'Mayan EDMS Documentation', status_code=200
|
||||
)
|
||||
|
||||
def test_document_ocr_download_view_no_permission(self):
|
||||
def test_document_parsing_download_view_no_permission(self):
|
||||
response = self.get(
|
||||
'ocr:document_ocr_download', args=(self.document.pk,)
|
||||
'document_parsing:document_content_download', args=(self.document.pk,)
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, 403)
|
||||
|
||||
def test_document_download_view_with_permission(self):
|
||||
def test_download_view_with_permission(self):
|
||||
self.expected_content_type = 'application/octet-stream; charset=utf-8'
|
||||
|
||||
self.grant_permission(permission=permission_ocr_content_view)
|
||||
self.grant_permission(permission=permission_content_view)
|
||||
response = self.get(
|
||||
'ocr:document_ocr_download', args=(self.document.pk,)
|
||||
'document_parsing:document_content_download', args=(self.document.pk,)
|
||||
)
|
||||
|
||||
self.assertEqual(response.status_code, 200)
|
||||
|
||||
self.assert_download_response(
|
||||
response, content=(
|
||||
''.join(get_document_ocr_content(document=self.document))
|
||||
''.join(get_document_content(document=self.document))
|
||||
),
|
||||
)
|
||||
|
||||
@@ -2,62 +2,43 @@ from __future__ import unicode_literals
|
||||
|
||||
from django.conf.urls import url
|
||||
|
||||
from .api_views import (
|
||||
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
|
||||
)
|
||||
from .api_views import APIDocumentPageContentView
|
||||
from .views import (
|
||||
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
||||
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
|
||||
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
|
||||
DocumentContentView, DocumentContentDownloadView,
|
||||
DocumentParsingErrorsListView, DocumentSubmitView, DocumentTypeSubmitView,
|
||||
ParseErrorListView
|
||||
)
|
||||
|
||||
urlpatterns = [
|
||||
url(
|
||||
r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(),
|
||||
r'^documents/(?P<pk>\d+)/content/$', DocumentContentView.as_view(),
|
||||
name='document_content'
|
||||
),
|
||||
url(
|
||||
r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
|
||||
name='document_submit'
|
||||
r'^documents/(?P<pk>\d+)/content/download/$',
|
||||
DocumentContentDownloadView.as_view(), name='document_content_download'
|
||||
),
|
||||
url(
|
||||
r'^document/all/submit/$', DocumentAllSubmitView.as_view(),
|
||||
name='document_submit_all'
|
||||
),
|
||||
url(
|
||||
r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
|
||||
r'^document_types/submit/$', DocumentTypeSubmitView.as_view(),
|
||||
name='document_type_submit'
|
||||
),
|
||||
url(
|
||||
r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(),
|
||||
r'^documents/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
|
||||
name='document_submit'
|
||||
),
|
||||
url(
|
||||
r'^documents/multiple/submit/$', DocumentSubmitView.as_view(),
|
||||
name='document_submit_multiple'
|
||||
),
|
||||
url(
|
||||
r'^document_type/(?P<pk>\d+)/ocr/settings/$',
|
||||
DocumentTypeSettingsEditView.as_view(),
|
||||
name='document_type_ocr_settings'
|
||||
r'^documents/(?P<pk>\d+)/errors/$',
|
||||
DocumentParsingErrorsListView.as_view(),
|
||||
name='document_parsing_error_list'
|
||||
),
|
||||
url(
|
||||
r'^documents/(?P<pk>\d+)/ocr/errors/$',
|
||||
DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
|
||||
),
|
||||
url(
|
||||
r'^documents/(?P<pk>\d+)/ocr/download/$',
|
||||
DocumentOCRDownloadView.as_view(), name='document_ocr_download'
|
||||
),
|
||||
url(r'^all/$', EntryListView.as_view(), name='entry_list'),
|
||||
url(r'^errors/all/$', ParseErrorListView.as_view(), name='error_list'),
|
||||
]
|
||||
|
||||
api_urls = [
|
||||
url(
|
||||
r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
|
||||
name='document-ocr-submit-view'
|
||||
),
|
||||
url(
|
||||
r'^document_version/(?P<pk>\d+)/submit/$',
|
||||
APIDocumentVersionOCRView.as_view(),
|
||||
name='document-version-ocr-submit-view'
|
||||
),
|
||||
url(
|
||||
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
|
||||
name='document-page-content-view'
|
||||
|
||||
@@ -6,10 +6,10 @@ from django.utils.html import conditional_escape
|
||||
from .models import DocumentPageContent
|
||||
|
||||
|
||||
def get_document_ocr_content(document):
|
||||
def get_document_content(document):
|
||||
for page in document.pages.all():
|
||||
try:
|
||||
page_content = page.ocr_content.content
|
||||
page_content = page.content.content
|
||||
except DocumentPageContent.DoesNotExist:
|
||||
pass
|
||||
else:
|
||||
|
||||
@@ -4,137 +4,27 @@ from django.contrib import messages
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.shortcuts import get_object_or_404
|
||||
from django.urls import reverse
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext_lazy as _, ungettext
|
||||
|
||||
from acls.models import AccessControlList
|
||||
from common.generics import (
|
||||
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
|
||||
SingleObjectEditView, SingleObjectListView
|
||||
FormView, MultipleObjectConfirmActionView, SingleObjectDetailView,
|
||||
SingleObjectDownloadView, SingleObjectListView
|
||||
)
|
||||
from common.mixins import MultipleInstanceActionMixin
|
||||
from documents.models import Document, DocumentType
|
||||
from documents.models import Document
|
||||
|
||||
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import (
|
||||
permission_ocr_content_view, permission_ocr_document,
|
||||
permission_document_type_ocr_setup
|
||||
)
|
||||
from .utils import get_document_ocr_content
|
||||
from .models import DocumentVersionParseError
|
||||
from .permissions import permission_content_view, permission_parse_document
|
||||
from .utils import get_document_content
|
||||
|
||||
|
||||
class DocumentAllSubmitView(ConfirmView):
    """Confirmation view that submits every document for OCR."""
    extra_context = {'title': _('Submit all documents for OCR?')}

    def get_post_action_redirect(self):
        """Return the URL reversed from the ``common:tools_list`` route."""
        return reverse('common:tools_list')

    def view_action(self):
        """Submit each document for OCR and flash the number submitted."""
        # enumerate() starting at 1 leaves the running total in `submitted`;
        # it stays 0 when there are no documents at all.
        submitted = 0
        for submitted, document in enumerate(Document.objects.all(), 1):
            document.submit_for_ocr()

        notice = _('%d documents added to the OCR queue.') % submitted
        messages.success(self.request, notice)
|
||||
|
||||
|
||||
class DocumentSubmitView(ConfirmView):
    """Confirmation view that submits a single document to the OCR queue."""

    def get_extra_context(self):
        """Return the template context: the document and a title naming it."""
        # Look the document up once and reuse it for both context entries.
        document = self.get_object()

        return {
            'object': document,
            'title': _('Submit "%s" to the OCR queue?') % document
        }

    def get_object(self):
        """Return the document selected by the ``pk`` URL keyword argument.

        Raises Http404 when no document has that primary key, instead of
        letting Document.DoesNotExist escape from the view.
        """
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def object_action(self, instance):
        """Check the OCR permission on ``instance`` for the requesting user,
        then submit the instance for OCR.

        Whatever ``check_access`` raises on a failed check propagates to the
        caller, so the submission is skipped in that case.
        """
        AccessControlList.objects.check_access(
            permissions=permission_ocr_document, user=self.request.user,
            obj=instance
        )

        instance.submit_for_ocr()

    def view_action(self):
        """Submit the selected document and flash a success message."""
        instance = self.get_object()

        self.object_action(instance=instance)

        messages.success(
            self.request,
            _('Document: %(document)s was added to the OCR queue.') % {
                'document': instance
            }
        )
|
||||
|
||||
|
||||
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
    """DocumentSubmitView combined with MultipleInstanceActionMixin, using
    messages and a title worded for a selection of documents."""
    model = Document
    success_message = '%(count)d document submitted to the OCR queue.'
    success_message_plural = '%(count)d documents submitted to the OCR queue.'

    def get_extra_context(self):
        """Return a context holding only the title for the selection."""
        # Deliberately does not call the base class method.
        context = {}
        context['title'] = _('Submit the selected documents to the OCR queue?')
        return context
|
||||
|
||||
|
||||
class DocumentTypeSubmitView(FormView):
    """Form view that submits all documents of a selected type for OCR."""
    extra_context = {
        'title': _('Submit all documents of a type for OCR')
    }
    form_class = DocumentTypeSelectForm

    def form_valid(self, form):
        """Submit every document of the chosen type for OCR, flash the
        number submitted and redirect to the success URL."""
        document_type = form.cleaned_data['document_type']

        # enumerate() starting at 1 leaves the running total in `queued`;
        # it stays 0 when the type has no documents.
        queued = 0
        for queued, document in enumerate(document_type.documents.all(), 1):
            document.submit_for_ocr()

        template = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'OCR queue.'
        )
        messages.success(
            self.request, template % {
                'count': queued,
                'document_type': document_type
            }
        )

        return HttpResponseRedirect(self.get_success_url())

    def get_post_action_redirect(self):
        """Return the URL reversed from the ``common:tools_list`` route."""
        return reverse('common:tools_list')
|
||||
|
||||
|
||||
class DocumentTypeSettingsEditView(SingleObjectEditView):
    """Edit view for the ``ocr_settings`` object of a document type."""
    view_permission = permission_document_type_ocr_setup
    fields = ('auto_ocr',)

    def get_extra_context(self):
        """Return a context whose title names the owning document type."""
        document_type = self.get_object().document_type
        heading = _('Edit OCR settings for document type: %s') % document_type
        return {'title': heading}

    def get_object(self, queryset=None):
        """Return the OCR settings of the document type selected by the
        ``pk`` URL keyword argument; Http404 if the type does not exist."""
        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
        return document_type.ocr_settings
|
||||
|
||||
|
||||
class DocumentOCRContent(SingleObjectDetailView):
|
||||
class DocumentContentView(SingleObjectDetailView):
|
||||
form_class = DocumentContentForm
|
||||
model = Document
|
||||
object_permission = permission_ocr_content_view
|
||||
object_permission = permission_content_view
|
||||
|
||||
def dispatch(self, request, *args, **kwargs):
|
||||
result = super(DocumentOCRContent, self).dispatch(
|
||||
result = super(DocumentContentView, self).dispatch(
|
||||
request, *args, **kwargs
|
||||
)
|
||||
self.get_object().add_as_recent_document_for_user(request.user)
|
||||
@@ -145,23 +35,25 @@ class DocumentOCRContent(SingleObjectDetailView):
|
||||
'document': self.get_object(),
|
||||
'hide_labels': True,
|
||||
'object': self.get_object(),
|
||||
'title': _('OCR result for document: %s') % self.get_object(),
|
||||
'title': _('Content for document: %s') % self.get_object(),
|
||||
}
|
||||
|
||||
|
||||
class EntryListView(SingleObjectListView):
|
||||
extra_context = {
|
||||
'hide_object': True,
|
||||
'title': _('OCR errors'),
|
||||
}
|
||||
view_permission = permission_ocr_document
|
||||
class DocumentContentDownloadView(SingleObjectDownloadView):
|
||||
model = Document
|
||||
object_permission = permission_content_view
|
||||
|
||||
def get_object_list(self):
|
||||
return DocumentVersionOCRError.objects.all()
|
||||
def get_file(self):
|
||||
file_object = DocumentContentDownloadView.TextIteratorIO(
|
||||
iterator=get_document_content(document=self.get_object())
|
||||
)
|
||||
return DocumentContentDownloadView.VirtualFile(
|
||||
file=file_object, name='{}-content'.format(self.get_object())
|
||||
)
|
||||
|
||||
|
||||
class DocumentOCRErrorsListView(SingleObjectListView):
|
||||
view_permission = permission_ocr_document
|
||||
class DocumentParsingErrorsListView(SingleObjectListView):
|
||||
view_permission = permission_content_view
|
||||
|
||||
def get_document(self):
|
||||
return get_object_or_404(Document, pk=self.kwargs['pk'])
|
||||
@@ -170,21 +62,93 @@ class DocumentOCRErrorsListView(SingleObjectListView):
|
||||
return {
|
||||
'hide_object': True,
|
||||
'object': self.get_document(),
|
||||
'title': _('OCR errors for document: %s') % self.get_document(),
|
||||
'title': _(
|
||||
'Parsing errors for document: %s'
|
||||
) % self.get_document(),
|
||||
}
|
||||
|
||||
def get_object_list(self):
|
||||
return self.get_document().latest_version.ocr_errors.all()
|
||||
return self.get_document().latest_version.parsing_errors.all()
|
||||
|
||||
|
||||
class DocumentOCRDownloadView(SingleObjectDownloadView):
|
||||
class DocumentSubmitView(MultipleObjectConfirmActionView):
|
||||
model = Document
|
||||
object_permission = permission_ocr_content_view
|
||||
object_permission = permission_parse_document
|
||||
success_message = _(
|
||||
'%(count)d document added to the parsing queue'
|
||||
)
|
||||
success_message_plural = _(
|
||||
'%(count)d documents added to the parsing queue'
|
||||
)
|
||||
|
||||
def get_file(self):
|
||||
file_object = DocumentOCRDownloadView.TextIteratorIO(
|
||||
iterator=get_document_ocr_content(document=self.get_object())
|
||||
def get_extra_context(self):
    """Return the confirmation context for the selected documents.

    The title is pluralized on the number of documents in the queryset.
    When exactly one document is selected, it is also exposed as
    ``object`` and the title names it instead of giving a count.
    """
    queryset = self.get_queryset()
    # Count once and reuse the value rather than calling count() each time.
    count = queryset.count()

    result = {
        'title': ungettext(
            singular='Submit %(count)d document to the parsing queue?',
            plural='Submit %(count)d documents to the parsing queue?',
            number=count
        ) % {
            'count': count,
        }
    }

    if count == 1:
        # Fetch the single document once for both context entries.
        document = queryset.first()
        result.update(
            {
                'object': document,
                'title': _(
                    'Submit document "%s" to the parsing queue?'
                ) % document
            }
        )

    return result
|
||||
|
||||
def object_action(self, instance, form=None):
|
||||
instance.submit_for_parsing()
|
||||
|
||||
|
||||
class DocumentTypeSubmitView(FormView):
    """Form view that submits all documents of a selected type for parsing."""
    form_class = DocumentTypeSelectForm
    extra_context = {
        'title': _('Submit all documents of a type for parsing')
    }

    def get_form_extra_kwargs(self):
        """Return the extra form keyword arguments: the requesting user."""
        return {
            'user': self.request.user
        }

    def get_post_action_redirect(self):
        """Return the URL reversed from the ``common:tools_list`` route."""
        return reverse('common:tools_list')

    def form_valid(self, form):
        """Submit every document of the chosen type for parsing, flash the
        number submitted and redirect to the success URL."""
        document_type = form.cleaned_data['document_type']

        count = 0
        for document in document_type.documents.all():
            document.submit_for_parsing()
            count += 1

        # Pluralize on the count so a single document does not read as
        # "1 documents".
        messages.success(
            self.request, ungettext(
                singular=(
                    '%(count)d document of type "%(document_type)s" added to '
                    'the parsing queue.'
                ),
                plural=(
                    '%(count)d documents of type "%(document_type)s" added to '
                    'the parsing queue.'
                ),
                number=count
            ) % {
                'count': count,
                'document_type': document_type
            }
        )

        return HttpResponseRedirect(self.get_success_url())
|
||||
|
||||
|
||||
class ParseErrorListView(SingleObjectListView):
    """List view over all DocumentVersionParseError entries."""
    view_permission = permission_content_view
    extra_context = {
        'title': _('Parsing errors'),
        'hide_object': True,
    }

    def get_object_list(self):
        """Return every recorded document version parse error."""
        parse_errors = DocumentVersionParseError.objects.all()
        return parse_errors
|
||||
|
||||
21
mayan/apps/documents/migrations/0041_auto_20170823_1855.py
Normal file
21
mayan/apps/documents/migrations/0041_auto_20170823_1855.py
Normal file
@@ -0,0 +1,21 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.7 on 2017-08-23 18:55
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import uuid
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Alter ``Document.uuid`` to a non-editable UUIDField defaulting to
    ``uuid.uuid4``."""

    # Must run after the previous migration of the documents app.
    dependencies = [('documents', '0040_auto_20170725_1111')]

    operations = [
        migrations.AlterField(
            field=models.UUIDField(default=uuid.uuid4, editable=False),
            model_name='document',
            name='uuid',
        ),
    ]
|
||||
Reference in New Issue
Block a user