Initial commit of the document parsing app.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
3
mayan/apps/document_parsing/__init__.py
Normal file
3
mayan/apps/document_parsing/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
default_app_config = 'document_parsing.apps.DocumentParsingApp'
|
||||||
23
mayan/apps/document_parsing/admin.py
Normal file
23
mayan/apps/document_parsing/admin.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.contrib import admin
|
||||||
|
|
||||||
|
from .models import (
|
||||||
|
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@admin.register(DocumentPageContent)
class DocumentPageContentAdmin(admin.ModelAdmin):
    """
    Admin interface for the text content stored for each document page.
    """

    # Only the related document page is shown in the change list.
    list_display = ('document_page',)
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): `DocumentTypeSettings` is imported from `.models`, but the
# models module of this app only defines `DocumentPageContent` and
# `DocumentVersionParseError` -- confirm whether this admin class should be
# removed or the model added.
@admin.register(DocumentTypeSettings)
class DocumentTypeSettingsAdmin(admin.ModelAdmin):
    """
    Admin interface for the per document type settings.
    """

    # Columns shown in the change list.
    list_display = ('document_type', 'auto_ocr')
|
||||||
|
|
||||||
|
|
||||||
|
# NOTE(review): `DocumentVersionOCRError` is imported from `.models`, but the
# models module of this app names its error model `DocumentVersionParseError`
# -- confirm which model this admin class is meant to register.
@admin.register(DocumentVersionOCRError)
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
    """
    Admin interface for the errors recorded per document version.
    """

    # Columns shown in the change list.
    list_display = ('document_version', 'datetime_submitted')
    # Error entries are shown for inspection only; no field is editable.
    readonly_fields = ('document_version', 'datetime_submitted', 'result')
|
||||||
97
mayan/apps/document_parsing/api_views.py
Normal file
97
mayan/apps/document_parsing/api_views.py
Normal file
@@ -0,0 +1,97 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
from rest_framework import generics, status
|
||||||
|
from rest_framework.response import Response
|
||||||
|
|
||||||
|
from documents.models import Document, DocumentPage, DocumentVersion
|
||||||
|
from rest_api.permissions import MayanPermission
|
||||||
|
|
||||||
|
from .models import DocumentPageContent
|
||||||
|
from .permissions import permission_ocr_content_view, permission_ocr_document
|
||||||
|
from .serializers import DocumentPageContentSerializer
|
||||||
|
|
||||||
|
|
||||||
|
class APIDocumentOCRView(generics.GenericAPIView):
    # POST only endpoint: looks up a Document by primary key and calls its
    # `submit_for_ocr()` method.
    # NOTE(review): `permission_ocr_document` is imported from `.permissions`
    # but that module only defines `permission_content_view` -- confirm.
    queryset = Document.objects.all()
    permission_classes = (MayanPermission,)
    mayan_object_permissions = {
        'POST': (permission_ocr_document,)
    }

    def get_serializer_class(self):
        # This view neither accepts nor returns a serialized body.
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """

        document = self.get_object()
        document.submit_for_ocr()
        # The response carries no body, only the 202 status code.
        return Response(status=status.HTTP_202_ACCEPTED)
|
||||||
|
|
||||||
|
|
||||||
|
class APIDocumentVersionOCRView(generics.GenericAPIView):
    # POST only endpoint: looks up a DocumentVersion by primary key and calls
    # its `submit_for_ocr()` method.
    # NOTE(review): `permission_ocr_document` is imported from `.permissions`
    # but that module only defines `permission_content_view` -- confirm.
    queryset = DocumentVersion.objects.all()
    permission_classes = (MayanPermission,)
    mayan_object_permissions = {
        'POST': (permission_ocr_document,)
    }

    def get_serializer_class(self):
        # This view neither accepts nor returns a serialized body.
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document version for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """

        document_version = self.get_object()
        document_version.submit_for_ocr()
        # The response carries no body, only the 202 status code.
        return Response(status=status.HTTP_202_ACCEPTED)
|
||||||
|
|
||||||
|
|
||||||
|
class APIDocumentPageContentView(generics.RetrieveAPIView):
    """
    Returns the OCR content of the selected document page.
    ---
    GET:
        parameters:
            - name: pk
              paramType: path
              type: number
    """

    # NOTE(review): `permission_ocr_content_view` is imported from
    # `.permissions` but that module only defines `permission_content_view`
    # -- confirm which permission is intended.
    mayan_object_permissions = {
        'GET': (permission_ocr_content_view,),
    }
    permission_classes = (MayanPermission,)
    serializer_class = DocumentPageContentSerializer
    queryset = DocumentPage.objects.all()

    def retrieve(self, request, *args, **kwargs):
        """
        Serialize the content object attached to the requested page.

        The page is resolved with `get_object()`. When the page has no
        related `DocumentPageContent` row, an empty queryset is handed to the
        serializer instead of raising.
        """
        instance = self.get_object()

        try:
            # The reverse accessor is `content`: that is the `related_name`
            # of DocumentPageContent.document_page in this app's models.
            page_content = instance.content
        except DocumentPageContent.DoesNotExist:
            page_content = DocumentPageContent.objects.none()

        serializer = self.get_serializer(page_content)
        return Response(serializer.data)
|
||||||
125
mayan/apps/document_parsing/apps.py
Normal file
125
mayan/apps/document_parsing/apps.py
Normal file
@@ -0,0 +1,125 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from kombu import Exchange, Queue
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
|
from django.db.models.signals import post_save
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from acls import ModelPermission
|
||||||
|
from common import (
|
||||||
|
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
|
||||||
|
menu_tools
|
||||||
|
)
|
||||||
|
from common.settings import settings_db_sync_task_delay
|
||||||
|
from documents.search import document_search, document_page_search
|
||||||
|
from documents.signals import post_version_upload
|
||||||
|
from documents.widgets import document_link
|
||||||
|
from mayan.celery import app
|
||||||
|
from navigation import SourceColumn
|
||||||
|
from rest_api.classes import APIEndPoint
|
||||||
|
|
||||||
|
from .handlers import handler_parse_document_version
|
||||||
|
from .links import (
|
||||||
|
link_document_content, link_entry_list, link_document_content_errors_list,
|
||||||
|
link_document_content_download
|
||||||
|
)
|
||||||
|
from .permissions import permission_content_view
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentParsingApp(MayanAppConfig):
    """
    Application configuration of the document parsing app.

    `ready()` wires the app into the rest of the project: API endpoint,
    permissions, list columns, search fields, menu links and the signal
    handler that parses newly uploaded document versions.
    """
    has_tests = True
    name = 'document_parsing'
    verbose_name = _('Document parsing')

    def ready(self):
        """
        Register the app's integrations once the app registry is loaded.
        """
        super(DocumentParsingApp, self).ready()

        APIEndPoint(app=self, version_string='1')

        # Models are resolved through the app registry instead of being
        # imported at module level.
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )

        DocumentVersion = apps.get_model(
            app_label='documents', model_name='DocumentVersion'
        )

        DocumentVersionParseError = self.get_model('DocumentVersionParseError')

        ModelPermission.register(
            model=Document, permissions=(permission_content_view,)
        )

        SourceColumn(
            source=DocumentVersionParseError, label=_('Document'),
            func=lambda context: document_link(context['object'].document_version.document)
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Added'),
            attribute='datetime_submitted'
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Result'),
            attribute='result'
        )

        document_search.add_model_field(
            field='versions__pages__content__content', label=_('Content')
        )

        document_page_search.add_model_field(
            field='content__content', label=_('Content')
        )

        menu_facet.bind_links(
            links=(link_document_content,), sources=(Document,)
        )
        # Only links that this module actually imports from `.links` are
        # bound. The previous bindings referenced names that are not
        # imported here (submit, submit multiple, document type settings,
        # OCR error list, OCR download) and raised NameError.
        menu_secondary.bind_links(
            links=(
                link_document_content, link_document_content_errors_list,
                link_document_content_download
            ),
            sources=(
                # View names match the `view` argument of the bound links.
                'document_parsing:document_content',
                'document_parsing:document_page_parsing_error_list',
                'document_parsing:document_content_download',
            )
        )
        menu_secondary.bind_links(
            links=(link_entry_list,),
            sources=(
                'document_parsing:entry_list',
                'document_parsing:entry_delete_multiple',
                'document_parsing:entry_re_queue_multiple',
                DocumentVersionParseError
            )
        )
        menu_tools.bind_links(
            links=(
                # Trailing comma required: `links` must be a tuple, not a
                # bare Link instance.
                link_entry_list,
            )
        )

        post_version_upload.connect(
            dispatch_uid='document_parsing_handler_parse_document_version',
            receiver=handler_parse_document_version,
            sender=DocumentVersion
        )
|
||||||
22
mayan/apps/document_parsing/exceptions.py
Normal file
22
mayan/apps/document_parsing/exceptions.py
Normal file
@@ -0,0 +1,22 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
|
||||||
|
class OCRError(Exception):
    """Error raised by the OCR backend."""
|
||||||
|
|
||||||
|
|
||||||
|
class ParserError(Exception):
    """Common base class for the exceptions raised by file parsers."""
|
||||||
|
|
||||||
|
|
||||||
|
class NoMIMETypeMatch(ParserError):
    """Raised when no parser is registered for the specified MIME type."""
|
||||||
104
mayan/apps/document_parsing/forms.py
Normal file
104
mayan/apps/document_parsing/forms.py
Normal file
@@ -0,0 +1,104 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django import forms
|
||||||
|
from django.utils.encoding import force_text
|
||||||
|
from django.utils.html import conditional_escape
|
||||||
|
from django.utils.safestring import mark_safe
|
||||||
|
from django.utils.translation import ugettext_lazy as _, ugettext
|
||||||
|
|
||||||
|
from common.widgets import TextAreaDiv
|
||||||
|
from documents.models import DocumentType
|
||||||
|
|
||||||
|
from .models import DocumentPageContent, DocumentPageOCRContent
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentContentForm(forms.Form):
    """
    Form that concatenates all of a document pages' text content into a
    single textarea widget
    """
    def __init__(self, *args, **kwargs):
        """
        Build the initial value of `contents` from the document's pages.

        The document is passed with the `instance` keyword argument. When it
        is missing (`None`) the form is rendered with empty contents.
        """
        self.document = kwargs.pop('instance', None)
        super(DocumentContentForm, self).__init__(*args, **kwargs)
        content = []

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document was provided (or it has no `pages` attribute).
            document_pages = []

        for page in document_pages:
            try:
                # `content` is the related_name of the one to one field
                # DocumentPageContent.document_page.
                page_content = page.content.content
            except DocumentPageContent.DoesNotExist:
                # Pages without stored content are skipped, divider included.
                pass
            else:
                # Page text is escaped; only the divider markup added below
                # is trusted HTML.
                content.append(conditional_escape(force_text(page_content)))
                content.append(
                    '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                        ugettext(
                            'Page %(page_number)d'
                        ) % {'page_number': page.page_number}
                    )
                )

        self.fields['contents'].initial = mark_safe(''.join(content))

    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentOCRContentForm(forms.Form):
    """
    Form that concatenates all of a document pages' text content into a
    single textarea widget
    """
    def __init__(self, *args, **kwargs):
        """
        Build the initial value of `contents` from the document's pages.

        The document is passed with the `instance` keyword argument. When it
        is missing (`None`) the form is rendered with empty contents.
        """
        self.document = kwargs.pop('instance', None)
        # super() must name this class. Naming DocumentContentForm raised
        # TypeError because this form is not a subclass of it.
        super(DocumentOCRContentForm, self).__init__(*args, **kwargs)
        content = []

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document was provided (or it has no `pages` attribute).
            document_pages = []

        for page in document_pages:
            try:
                # `content` is the related_name of the one to one field
                # DocumentPageContent.document_page, so the matching
                # DoesNotExist belongs to DocumentPageContent.
                page_content = page.content.content
            except DocumentPageContent.DoesNotExist:
                # Pages without stored content are skipped, divider included.
                pass
            else:
                # Page text is escaped; only the divider markup added below
                # is trusted HTML.
                content.append(conditional_escape(force_text(page_content)))
                content.append(
                    '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                        ugettext(
                            'Page %(page_number)d'
                        ) % {'page_number': page.page_number}
                    )
                )

        self.fields['contents'].initial = mark_safe(''.join(content))

    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentTypeSelectForm(forms.Form):
    """
    Form with a single dropdown to select one of the existing document types.
    """
    document_type = forms.ModelChoiceField(
        # The label is wrapped in the lazy translation function like every
        # other user facing string of this module.
        queryset=DocumentType.objects.all(), label=_('Document type')
    )
|
||||||
15
mayan/apps/document_parsing/handlers.py
Normal file
15
mayan/apps/document_parsing/handlers.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
|
|
||||||
|
from .settings import setting_auto_ocr
|
||||||
|
from .parsers import Parser
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
def handler_parse_document_version(sender, instance, **kwargs):
    """
    Signal receiver that parses the content of a document version.

    sender: class that sent the signal.
    instance: the document version to parse.
    kwargs: extra signal arguments. An optional `created` flag is honored;
        when it is present and false the version is not parsed.
    """
    # `created` is read with a default so that a signal that does not send
    # the flag triggers parsing instead of raising KeyError.
    # NOTE(review): assumes the upload signal fires once per new version --
    # confirm against the signal's sender.
    if kwargs.get('created', True):
        Parser.parse_document_version(document_version=instance)
|
||||||
27
mayan/apps/document_parsing/links.py
Normal file
27
mayan/apps/document_parsing/links.py
Normal file
@@ -0,0 +1,27 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from navigation import Link
|
||||||
|
|
||||||
|
from .permissions import permission_content_view
|
||||||
|
|
||||||
|
# All links use `permission_content_view`, the only permission this module
# imports. The names used before (`permission_ocr_document`,
# `permission_ocr_content_view`) were never imported and raised NameError
# when the module was loaded.
# NOTE(review): confirm whether the error list and download links need
# dedicated permissions instead.

# Shows the parsed content of a document.
link_document_content = Link(
    args='resolved_object.id', icon='fa fa-font',
    permissions=(permission_content_view,), text=_('Content'),
    view='document_parsing:document_content',
)
# Global list of parsing errors.
link_entry_list = Link(
    icon='fa fa-file-text-o', permissions=(permission_content_view,),
    text=_('Parsing errors'), view='document_parsing:entry_list'
)
# Parsing errors of a single object.
link_document_content_errors_list = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Parsing errors'),
    view='document_parsing:document_page_parsing_error_list'
)
# Downloads the parsed content of a document.
link_document_content_download = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Download content'),
    view='document_parsing:document_content_download'
)
|
||||||
14
mayan/apps/document_parsing/managers.py
Normal file
14
mayan/apps/document_parsing/managers.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
|
import logging
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
|
from django.db import models
|
||||||
|
from django.utils.timezone import now
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentPageContentManager(models.Manager):
    """
    Manager of the DocumentPageContent model. Adds nothing to the default
    manager behavior yet.
    """
|
||||||
47
mayan/apps/document_parsing/models.py
Normal file
47
mayan/apps/document_parsing/models.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import models
|
||||||
|
from django.utils.encoding import force_text, python_2_unicode_compatible
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
||||||
|
|
||||||
|
from .managers import DocumentPageContentManager
|
||||||
|
|
||||||
|
|
||||||
|
# Text content of a single document page. Each page has at most one content
# row, reachable from the page through the `content` reverse accessor.
@python_2_unicode_compatible
class DocumentPageContent(models.Model):
    document_page = models.OneToOneField(
        DocumentPage, on_delete=models.CASCADE, related_name='content',
        verbose_name=_('Document page')
    )
    # May be left blank.
    content = models.TextField(blank=True, verbose_name=_('Content'))

    objects = DocumentPageContentManager()

    class Meta:
        verbose_name = _('Document page content')
        verbose_name_plural = _('Document pages contents')

    def __str__(self):
        # Represented by the text form of the page it belongs to.
        return force_text(self.document_page)
|
||||||
|
|
||||||
|
|
||||||
|
# Record of a parsing error for a document version. Entries are ordered by
# submission date and are deleted together with their document version.
@python_2_unicode_compatible
class DocumentVersionParseError(models.Model):
    document_version = models.ForeignKey(
        DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
        verbose_name=_('Document version')
    )
    datetime_submitted = models.DateTimeField(
        # The field option is `auto_now_add`; `auto_add_now` is not a valid
        # keyword argument and made the field constructor raise TypeError.
        auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
    )
    # Free form description of the error; optional.
    result = models.TextField(blank=True, null=True, verbose_name=_('Result'))

    def __str__(self):
        # Represented by the text form of the affected document version.
        return force_text(self.document_version)

    class Meta:
        ordering = ('datetime_submitted',)
        verbose_name = _('Document version parse error')
        verbose_name_plural = _('Document version parse errors')
|
||||||
202
mayan/apps/document_parsing/parsers.py
Normal file
202
mayan/apps/document_parsing/parsers.py
Normal file
@@ -0,0 +1,202 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from io import BytesIO
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||||
|
from pdfminer.pdfpage import PDFPage
|
||||||
|
from pdfminer.converter import TextConverter
|
||||||
|
from pdfminer.layout import LAParams
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from common.utils import copyfile, fs_cleanup, mkstemp
|
||||||
|
|
||||||
|
from .exceptions import ParserError, NoMIMETypeMatch
|
||||||
|
from .models import DocumentPageContent
|
||||||
|
from .settings import setting_pdftotext_path
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class Parser(object):
    """
    Parser base class

    Keeps a class level registry that maps a MIME type to an ordered list of
    parser classes. Subclasses implement `execute()` to extract the text of
    one page from a file object.
    """

    # MIME type -> list of parser classes, in registration order.
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append every class in `parser_classes` to the parser list of every
        MIME type in `mimetypes`.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(
                    mimetype, []
                ).append(parser_class)

    @classmethod
    def _run_parsers(cls, mimetype, method_name, argument):
        """
        Try each parser registered for `mimetype` until one succeeds.

        Calls the method named `method_name` of a fresh parser instance with
        `argument`. Raises NoMIMETypeMatch when the MIME type is not
        registered or when every registered parser raised ParserError.
        """
        # Only the registry lookup is guarded. A KeyError raised while a
        # parser runs is a real error and must not be reported as a missing
        # MIME type.
        try:
            parser_classes = cls._registry[mimetype]
        except KeyError:
            raise NoMIMETypeMatch

        for parser_class in parser_classes:
            try:
                parser = parser_class()
                getattr(parser, method_name)(argument)
            except ParserError:
                # If parser raises error, try next parser in the list
                pass
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse every page of `document_version` with the first registered
        parser that succeeds. Raises NoMIMETypeMatch if none does.
        """
        cls._run_parsers(
            document_version.mimetype, 'process_document_version',
            document_version
        )

    @classmethod
    def parse_document_page(cls, document_page):
        """
        Parse a single page with the first registered parser that succeeds.
        Raises NoMIMETypeMatch if none does.
        """
        cls._run_parsers(
            document_page.document_version.mimetype, 'process_document_page',
            document_page
        )

    def process_document_version(self, document_version):
        """
        Parse all the pages of `document_version` with this parser.
        """
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Extract the text of `document_page` and store it in its
        DocumentPageContent row, creating the row when needed.

        Any exception raised while parsing or saving is logged and re-raised
        as ParserError. The intermediate file is always closed.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page `page_number` of `file_object`. Must be
        implemented by subclasses.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
|
||||||
|
|
||||||
|
|
||||||
|
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext execute from the poppler package
    """

    def __init__(self):
        """
        Read the configured pdftotext path. Raises ParserError when no file
        exists at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Run pdftotext on a temporary copy of `file_object` and return the
        text it prints for page `page_number`.

        Raises ParserError when pdftotext exits with a non zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()

        # The temporary copy is removed on every exit path, including a
        # failure of the copy or of the process launch.
        try:
            copyfile(file_object, temp_filepath)

            # -f/-l select the first and last page; '-' sends the text to
            # standard output.
            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. Calling wait()
            # first can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        # A lone form feed means the page produced no text.
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        # Strip the trailing blank line and form feed.
        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
|
||||||
|
|
||||||
|
|
||||||
|
class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """

    def execute(self, file_object, page_number):
        """
        Return the text PDFMiner extracts from page `page_number` of
        `file_object`, as the bytes written to an in-memory buffer.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, outfp=string_buffer, laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # `pagenos` is zero based while `page_number` starts at 1.
                page = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # next() works on Python 2 and 3; the `.next()` method only
                # exists on Python 2 generators.
                interpreter.process_page(next(page))
            finally:
                # Close the converter even when the page cannot be processed.
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            # Read the value before the buffer is closed by the `with` block.
            return string_buffer.getvalue()
|
||||||
|
|
||||||
|
|
||||||
|
# PDF files are parsed with pdftotext first; PDFMiner is the fallback used
# when the first parser raises ParserError.
Parser.register(
    parser_classes=(PopplerParser, PDFMinerParser),
    mimetypes=('application/pdf',)
)
|
||||||
11
mayan/apps/document_parsing/permissions.py
Normal file
11
mayan/apps/document_parsing/permissions.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from permissions import PermissionNamespace
|
||||||
|
|
||||||
|
# Namespace that groups the permissions declared by this app.
namespace = PermissionNamespace('document_parsing', _('Document parsing'))

# Permission required to view the content of a document.
permission_content_view = namespace.add_permission(
    label=_('View the content of a document'), name='content_view'
)
|
||||||
10
mayan/apps/document_parsing/queues.py
Normal file
10
mayan/apps/document_parsing/queues.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from task_manager.classes import CeleryQueue
|
||||||
|
|
||||||
|
# NOTE(review): the queue name, label and task path all refer to OCR while
# this module belongs to the document parsing app -- confirm they are
# intended.
queue_ocr = CeleryQueue(label=_('OCR'), name='ocr')
queue_ocr.add_task_type(
    label=_('Document version OCR'), name='ocr.tasks.task_do_ocr'
)
|
||||||
11
mayan/apps/document_parsing/serializers.py
Normal file
11
mayan/apps/document_parsing/serializers.py
Normal file
@@ -0,0 +1,11 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from rest_framework import serializers
|
||||||
|
|
||||||
|
from .models import DocumentPageContent
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentPageContentSerializer(serializers.ModelSerializer):
    # Exposes only the `content` field of a DocumentPageContent instance.
    class Meta:
        model = DocumentPageContent
        fields = ('content',)
|
||||||
17
mayan/apps/document_parsing/settings.py
Normal file
17
mayan/apps/document_parsing/settings.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from smart_settings import Namespace
|
||||||
|
|
||||||
|
# Settings namespace of the document parsing app.
namespace = Namespace(label=_('Document parsing'), name='document_parsing')

# Location of the pdftotext executable read by the poppler based parser.
setting_pdftotext_path = namespace.add_setting(
    default='/usr/bin/pdftotext',
    global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH',
    help_text=_(
        "File path to poppler's pdftotext program used to extract text "
        "from PDF files."
    ),
    is_path=True
)
|
||||||
0
mayan/apps/document_parsing/tests/__init__.py
Normal file
0
mayan/apps/document_parsing/tests/__init__.py
Normal file
88
mayan/apps/document_parsing/tests/test_api.py
Normal file
88
mayan/apps/document_parsing/tests/test_api.py
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import json
|
||||||
|
|
||||||
|
from django.contrib.auth import get_user_model
|
||||||
|
from django.urls import reverse
|
||||||
|
|
||||||
|
from rest_framework import status
|
||||||
|
|
||||||
|
from documents.models import DocumentType
|
||||||
|
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||||
|
from rest_api.tests import BaseAPITestCase
|
||||||
|
from user_management.tests import (
|
||||||
|
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints
    """

    def setUp(self):
        """
        Log in as a superuser and upload the small test document.
        """
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The test document is opened in binary mode so its bytes are read
        # unchanged regardless of platform or Python version.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        # `content` is the reverse accessor declared by this app's
        # DocumentPageContent model.
        content = self.document.pages.first().content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_submit_document_version(self):
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_get_document_version_page_content(self):
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)

        self.assertIn(
            'Mayan EDMS Documentation', json.loads(response.content)['content']
        )
|
||||||
41
mayan/apps/document_parsing/tests/test_events.py
Normal file
41
mayan/apps/document_parsing/tests/test_events.py
Normal file
@@ -0,0 +1,41 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from actstream.models import Action
|
||||||
|
|
||||||
|
from documents.tests.test_models import GenericDocumentTestCase
|
||||||
|
|
||||||
|
from ..events import (
|
||||||
|
event_ocr_document_version_submit, event_ocr_document_version_finish
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class OCREventsTestCase(GenericDocumentTestCase):
    """Check the actions recorded when a document is submitted for OCR."""

    def test_document_version_submit_event(self):
        """The first recorded action is the submit event of the version."""
        # Clear existing actions so only those recorded from here on are
        # inspected.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.first().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.first().verb,
            event_ocr_document_version_submit.name
        )

    def test_document_version_finish_event(self):
        """The last recorded action is the finish event of the version."""
        # Clear existing actions so only those recorded from here on are
        # inspected.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        # Debugging leftovers (print statements and a function-local models
        # import) were removed; they only dumped querysets to stdout.
        self.assertEqual(
            Action.objects.last().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.last().verb,
            event_ocr_document_version_finish.name
        )
|
||||||
77
mayan/apps/document_parsing/tests/test_models.py
Normal file
77
mayan/apps/document_parsing/tests/test_models.py
Normal file
@@ -0,0 +1,77 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from common.tests import BaseTestCase
|
||||||
|
from documents.models import DocumentType
|
||||||
|
from documents.settings import setting_language_choices
|
||||||
|
from documents.tests import (
|
||||||
|
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentOCRTestCase(BaseTestCase):
    """Upload the small test document and check its stored page content."""

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        """Create a document type and upload the small test document."""
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open the fixture in binary mode so its bytes are uploaded
        # unchanged (no newline translation or text decoding).
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        """Delete the document and its type, then run the parent cleanup."""
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """The first page content must contain the expected title text."""
        content = self.document.pages.first().ocr_content.content
        self.assertTrue('Mayan EDMS Documentation' in content)
|
||||||
|
|
||||||
|
|
||||||
|
class GermanOCRSupportTestCase(BaseTestCase):
    """Upload a German test document and check its stored page content."""

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        """Create a document type and upload the German test document."""
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get corresponding language code for German from the default
        # language choices list. Each choice is indexed as
        # (code, label); the first entry labelled 'German' is used.
        language_code = [
            language for language in setting_language_choices.value
            if language[1] == 'German'
        ][0][0]

        self.assertEqual('deu', language_code)

        # Open the fixture in binary mode so its bytes are uploaded
        # unchanged (no newline translation or text decoding).
        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        """Delete the document type, then run the parent cleanup."""
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        """The first page content must contain the expected German text."""
        content = self.document.pages.first().ocr_content.content

        self.assertTrue(
            'Repository für elektronische Dokumente.' in content
        )
        self.assertTrue(
            'Es bietet einen' in content
        )
|
||||||
83
mayan/apps/document_parsing/tests/test_parsers.py
Normal file
83
mayan/apps/document_parsing/tests/test_parsers.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.core.files.base import File
|
||||||
|
from django.test import override_settings
|
||||||
|
|
||||||
|
from common.tests import BaseTestCase
|
||||||
|
from documents.models import DocumentType
|
||||||
|
from documents.tests import (
|
||||||
|
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
||||||
|
)
|
||||||
|
|
||||||
|
from ..classes import TextExtractor
|
||||||
|
from ..parsers import PDFMinerParser, PopplerParser
|
||||||
|
|
||||||
|
|
||||||
|
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """Run each parser on the test document and check the stored text."""

    def setUp(self):
        """Create a document type and upload the test document."""
        super(ParserTestCase, self).setUp()
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open the fixture in binary mode so its bytes are uploaded
        # unchanged (no newline translation or text decoding).
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        """Delete the document type, then run the parent cleanup."""
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        """PDFMinerParser must store the expected text for the first page."""
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertTrue(
            'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        """PopplerParser must store the expected text for the first page."""
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertTrue(
            'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
        )
|
||||||
|
|
||||||
|
|
||||||
|
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """Run TextExtractor on the hybrid test document and check the text."""

    def setUp(self):
        """Create a document type and upload the hybrid test document."""
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open the fixture in binary mode so its bytes are uploaded
        # unchanged (no newline translation or text decoding).
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        """Delete the document type, then run the parent cleanup."""
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        """First and last page must hold the expected extracted text."""
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )
|
||||||
61
mayan/apps/document_parsing/tests/test_views.py
Normal file
61
mayan/apps/document_parsing/tests/test_views.py
Normal file
@@ -0,0 +1,61 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.test import override_settings
|
||||||
|
|
||||||
|
from documents.tests.test_views import GenericDocumentViewTestCase
|
||||||
|
|
||||||
|
from ..permissions import permission_ocr_content_view
|
||||||
|
from ..utils import get_document_ocr_content
|
||||||
|
|
||||||
|
|
||||||
|
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
    """Permission and content tests for the OCR content and download views."""

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        """Run the parent setup and log the test user in."""
        super(OCRViewsTestCase, self).setUp()
        self.login_user()

    def _document_content_view(self):
        """Request the OCR content view of the test document."""
        view_args = (self.document.pk,)
        return self.get('ocr:document_content', args=view_args)

    def _document_ocr_download_view(self):
        """Request the OCR download view of the test document."""
        view_args = (self.document.pk,)
        return self.get('ocr:document_ocr_download', args=view_args)

    def test_document_content_view_no_permissions(self):
        """Without the permission the content view answers 403."""
        response = self._document_content_view()

        self.assertEqual(response.status_code, 403)

    def test_document_content_view_with_permission(self):
        """With the permission the content view shows the page text."""
        self.grant_permission(permission=permission_ocr_content_view)

        response = self._document_content_view()

        self.assertContains(
            response, 'Mayan EDMS Documentation', status_code=200
        )

    def test_document_ocr_download_view_no_permission(self):
        """Without the permission the download view answers 403."""
        response = self._document_ocr_download_view()

        self.assertEqual(response.status_code, 403)

    def test_document_download_view_with_permission(self):
        """With the permission the download matches the stored content."""
        self.expected_content_type = 'application/octet-stream; charset=utf-8'

        self.grant_permission(permission=permission_ocr_content_view)
        response = self._document_ocr_download_view()

        self.assertEqual(response.status_code, 200)

        # The download body is the concatenation of every page's content.
        expected_content = ''.join(
            get_document_ocr_content(document=self.document)
        )
        self.assert_download_response(response, content=expected_content)
|
||||||
65
mayan/apps/document_parsing/urls.py
Normal file
65
mayan/apps/document_parsing/urls.py
Normal file
@@ -0,0 +1,65 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.conf.urls import url
|
||||||
|
|
||||||
|
from .api_views import (
|
||||||
|
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
|
||||||
|
)
|
||||||
|
from .views import (
|
||||||
|
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
||||||
|
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
|
||||||
|
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
|
||||||
|
)
|
||||||
|
|
||||||
|
# User interface URL patterns of the app. The order of the entries is kept
# as originally declared.
urlpatterns = [
    # Per document content view and submit action.
    url(
        regex=r'^(?P<pk>\d+)/content/$',
        view=DocumentOCRContent.as_view(),
        name='document_content'
    ),
    url(
        regex=r'^document/(?P<pk>\d+)/submit/$',
        view=DocumentSubmitView.as_view(),
        name='document_submit'
    ),
    # Bulk submit actions.
    url(
        regex=r'^document/all/submit/$',
        view=DocumentAllSubmitView.as_view(),
        name='document_submit_all'
    ),
    url(
        regex=r'^document/type/submit/$',
        view=DocumentTypeSubmitView.as_view(),
        name='document_type_submit'
    ),
    url(
        regex=r'^document/multiple/submit/$',
        view=DocumentSubmitManyView.as_view(),
        name='document_submit_multiple'
    ),
    # Per document type settings.
    url(
        regex=r'^document_type/(?P<pk>\d+)/ocr/settings/$',
        view=DocumentTypeSettingsEditView.as_view(),
        name='document_type_ocr_settings'
    ),
    # Per document error list and content download.
    url(
        regex=r'^documents/(?P<pk>\d+)/ocr/errors/$',
        view=DocumentOCRErrorsListView.as_view(),
        name='document_ocr_error_list'
    ),
    url(
        regex=r'^documents/(?P<pk>\d+)/ocr/download/$',
        view=DocumentOCRDownloadView.as_view(),
        name='document_ocr_download'
    ),
    # List of all error entries.
    url(regex=r'^all/$', view=EntryListView.as_view(), name='entry_list'),
]
|
||||||
|
|
||||||
|
# REST API URL patterns of the app: submit a document or a document version,
# and read the content of a single page.
api_urls = [
    url(
        regex=r'^document/(?P<pk>\d+)/submit/$',
        view=APIDocumentOCRView.as_view(),
        name='document-ocr-submit-view'
    ),
    url(
        regex=r'^document_version/(?P<pk>\d+)/submit/$',
        view=APIDocumentVersionOCRView.as_view(),
        name='document-version-ocr-submit-view'
    ),
    url(
        regex=r'^page/(?P<pk>\d+)/content/$',
        view=APIDocumentPageContentView.as_view(),
        name='document-page-content-view'
    ),
]
|
||||||
16
mayan/apps/document_parsing/utils.py
Normal file
16
mayan/apps/document_parsing/utils.py
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.utils.encoding import force_text
|
||||||
|
from django.utils.html import conditional_escape
|
||||||
|
|
||||||
|
from .models import DocumentPageContent
|
||||||
|
|
||||||
|
|
||||||
|
def get_document_ocr_content(document):
    """
    Yield the escaped text content of each page of a document.

    Pages are visited in the order returned by ``document.pages.all()``.
    A page without a related content entry is skipped. Each yielded value
    is the page content passed through ``force_text`` and then
    ``conditional_escape``.
    """
    for document_page in document.pages.all():
        try:
            raw_text = document_page.ocr_content.content
        except DocumentPageContent.DoesNotExist:
            # No content stored for this page; leave it out of the output.
            continue

        yield conditional_escape(force_text(raw_text))
|
||||||
190
mayan/apps/document_parsing/views.py
Normal file
190
mayan/apps/document_parsing/views.py
Normal file
@@ -0,0 +1,190 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
from django.contrib import messages
|
||||||
|
from django.http import HttpResponseRedirect
|
||||||
|
from django.shortcuts import get_object_or_404
|
||||||
|
from django.urls import reverse
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from acls.models import AccessControlList
|
||||||
|
from common.generics import (
|
||||||
|
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
|
||||||
|
SingleObjectEditView, SingleObjectListView
|
||||||
|
)
|
||||||
|
from common.mixins import MultipleInstanceActionMixin
|
||||||
|
from documents.models import Document, DocumentType
|
||||||
|
|
||||||
|
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
||||||
|
from .models import DocumentVersionOCRError
|
||||||
|
from .permissions import (
|
||||||
|
permission_ocr_content_view, permission_ocr_document,
|
||||||
|
permission_document_type_ocr_setup
|
||||||
|
)
|
||||||
|
from .utils import get_document_ocr_content
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentAllSubmitView(ConfirmView):
    """Confirmation view that submits every document for OCR."""

    # NOTE(review): no view_permission is declared on this view, unlike
    # EntryListView in this module -- confirm this is intended.
    extra_context = {'title': _('Submit all documents for OCR?')}

    def get_post_action_redirect(self):
        """Return the URL of the tools list, used after the action."""
        return reverse('common:tools_list')

    def view_action(self):
        """Submit each document for OCR and report how many were queued."""
        submitted = 0

        all_documents = Document.objects.all()
        for document in all_documents:
            document.submit_for_ocr()
            submitted += 1

        message = _('%d documents added to the OCR queue.') % submitted
        messages.success(self.request, message)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentSubmitView(ConfirmView):
    """Confirmation view that submits a single document for OCR."""

    def get_extra_context(self):
        """Return the document and the confirmation title for the template."""
        # Look the document up once instead of once per context key.
        document = self.get_object()

        return {
            'object': document,
            'title': _('Submit "%s" to the OCR queue?') % document
        }

    def get_object(self):
        """
        Return the document selected by the ``pk`` URL argument.

        Raises Http404 when no such document exists, so an unknown pk is
        answered with a "not found" page instead of an unhandled
        Document.DoesNotExist error.
        """
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def object_action(self, instance):
        """Check the user's OCR permission for ``instance``, then submit it."""
        # The access check runs before the submission; the submission is
        # only reached when the check does not raise.
        AccessControlList.objects.check_access(
            permissions=permission_ocr_document, user=self.request.user,
            obj=instance
        )

        instance.submit_for_ocr()

    def view_action(self):
        """Submit the selected document and show a success message."""
        instance = self.get_object()

        self.object_action(instance=instance)

        messages.success(
            self.request,
            _('Document: %(document)s was added to the OCR queue.') % {
                'document': instance
            }
        )
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
    """Confirmation view that submits several selected documents for OCR."""

    model = Document
    # Singular and plural templates; both take a ``count`` value.
    success_message = '%(count)d document submitted to the OCR queue.'
    success_message_plural = '%(count)d documents submitted to the OCR queue.'

    def get_extra_context(self):
        """Return only a title, replacing DocumentSubmitView's context."""
        title = _('Submit the selected documents to the OCR queue?')
        return {'title': title}
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentTypeSubmitView(FormView):
    """Form view that submits all documents of a chosen type for OCR."""

    form_class = DocumentTypeSelectForm
    extra_context = {
        'title': _('Submit all documents of a type for OCR')
    }

    def get_post_action_redirect(self):
        """Return the URL of the tools list."""
        return reverse('common:tools_list')

    def form_valid(self, form):
        """
        Submit every document of the selected type and redirect.

        The document type comes from the form's cleaned data. A success
        message reports how many documents were submitted.
        """
        submitted = 0

        type_documents = form.cleaned_data['document_type'].documents.all()
        for document in type_documents:
            document.submit_for_ocr()
            submitted += 1

        message = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'OCR queue.'
        ) % {
            'count': submitted,
            'document_type': form.cleaned_data['document_type']
        }
        messages.success(self.request, message)

        return HttpResponseRedirect(self.get_success_url())
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentTypeSettingsEditView(SingleObjectEditView):
    """Edit view for the OCR settings attached to a document type."""

    fields = ('auto_ocr',)
    view_permission = permission_document_type_ocr_setup

    def get_object(self, queryset=None):
        """
        Return the ``ocr_settings`` of the document type given by ``pk``.

        Raises Http404 when the document type does not exist.
        """
        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
        return document_type.ocr_settings

    def get_extra_context(self):
        """Return a title naming the document type being edited."""
        ocr_settings = self.get_object()
        title = _(
            'Edit OCR settings for document type: %s'
        ) % ocr_settings.document_type

        return {'title': title}
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentOCRContent(SingleObjectDetailView):
    """Detail view that displays the stored content of a document."""

    form_class = DocumentContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        """Dispatch as usual, then record the document as recently used."""
        response = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )

        # Runs after the parent dispatch has produced its response.
        document = self.get_object()
        document.add_as_recent_document_for_user(request.user)

        return response

    def get_extra_context(self):
        """Return the document, display flags and title for the template."""
        context = {'hide_labels': True}
        context['document'] = self.get_object()
        context['object'] = self.get_object()
        context['title'] = _('OCR result for document: %s') % self.get_object()

        return context
|
||||||
|
|
||||||
|
|
||||||
|
class EntryListView(SingleObjectListView):
    """List view of every recorded DocumentVersionOCRError."""

    view_permission = permission_ocr_document
    extra_context = {
        'hide_object': True,
        'title': _('OCR errors'),
    }

    def get_object_list(self):
        """Return all error entries, unfiltered."""
        error_entries = DocumentVersionOCRError.objects.all()
        return error_entries
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentOCRErrorsListView(SingleObjectListView):
    """List view of the OCR errors of a document's latest version."""

    view_permission = permission_ocr_document

    def get_document(self):
        """Return the document given by ``pk`` or raise Http404."""
        document_pk = self.kwargs['pk']
        return get_object_or_404(Document, pk=document_pk)

    def get_extra_context(self):
        """Return the document, display flag and title for the template."""
        context = {'hide_object': True}
        context['object'] = self.get_document()
        context['title'] = _(
            'OCR errors for document: %s'
        ) % self.get_document()

        return context

    def get_object_list(self):
        """Return the error entries of the document's latest version."""
        latest_version = self.get_document().latest_version
        return latest_version.ocr_errors.all()
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentOCRDownloadView(SingleObjectDownloadView):
    """Download view that serves the stored content of a document."""

    model = Document
    object_permission = permission_ocr_content_view

    def get_file(self):
        """
        Return a virtual file wrapping the document's page content.

        The content comes from ``get_document_ocr_content`` and the file
        name is the document's string form followed by ``-OCR``.
        """
        content_iterator = get_document_ocr_content(
            document=self.get_object()
        )
        text_stream = DocumentOCRDownloadView.TextIteratorIO(
            iterator=content_iterator
        )
        download_name = '{}-OCR'.format(self.get_object())

        return DocumentOCRDownloadView.VirtualFile(
            file=text_stream, name=download_name
        )
|
||||||
@@ -84,6 +84,7 @@ INSTALLED_APPS = (
|
|||||||
'checkouts',
|
'checkouts',
|
||||||
'document_comments',
|
'document_comments',
|
||||||
'document_indexing',
|
'document_indexing',
|
||||||
|
'document_parsing',
|
||||||
'document_signatures',
|
'document_signatures',
|
||||||
'document_states',
|
'document_states',
|
||||||
'documents',
|
'documents',
|
||||||
|
|||||||
Reference in New Issue
Block a user