Initial commit of the document parsing app.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
3
mayan/apps/document_parsing/__init__.py
Normal file
3
mayan/apps/document_parsing/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
# Dotted path of the app configuration class to use for this package.
default_app_config = 'document_parsing.apps.DocumentParsingApp'
|
||||
23
mayan/apps/document_parsing/admin.py
Normal file
23
mayan/apps/document_parsing/admin.py
Normal file
@@ -0,0 +1,23 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.contrib import admin
|
||||
|
||||
# Only the models that this app's models.py actually defines are imported
# and registered. The previous registrations referenced
# ``DocumentTypeSettings`` and ``DocumentVersionOCRError``, which do not
# exist in this package and made the module fail on import.
from .models import DocumentPageContent, DocumentVersionParseError


@admin.register(DocumentPageContent)
class DocumentPageContentAdmin(admin.ModelAdmin):
    """Admin listing of the text content stored per document page."""
    list_display = ('document_page',)


@admin.register(DocumentVersionParseError)
class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
    """Admin view of document version parse errors; all fields read-only."""
    list_display = ('document_version', 'datetime_submitted')
    readonly_fields = ('document_version', 'datetime_submitted', 'result')
|
||||
97
mayan/apps/document_parsing/api_views.py
Normal file
97
mayan/apps/document_parsing/api_views.py
Normal file
@@ -0,0 +1,97 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from rest_framework import generics, status
|
||||
from rest_framework.response import Response
|
||||
|
||||
from documents.models import Document, DocumentPage, DocumentVersion
|
||||
from rest_api.permissions import MayanPermission
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .permissions import permission_ocr_content_view, permission_ocr_document
|
||||
from .serializers import DocumentPageContentSerializer
|
||||
|
||||
|
||||
# API endpoint that submits a single document for OCR on POST.
# NOTE(review): ``permission_ocr_document`` is not defined by this app's
# permissions module in this commit, and ``submit_for_ocr()`` is not defined
# anywhere in this package -- confirm both are provided elsewhere.
class APIDocumentOCRView(generics.GenericAPIView):
    # Object-level permissions required, keyed by HTTP method.
    mayan_object_permissions = {
        'POST': (permission_ocr_document,)
    }
    permission_classes = (MayanPermission,)
    queryset = Document.objects.all()

    def get_serializer_class(self):
        # The view neither reads a request body nor returns one.
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """

        # get_object() resolves the document from the URL arguments.
        self.get_object().submit_for_ocr()
        return Response(status=status.HTTP_202_ACCEPTED)
|
||||
|
||||
|
||||
# API endpoint that submits a single document version for OCR on POST.
# NOTE(review): ``permission_ocr_document`` is not defined by this app's
# permissions module in this commit, and ``submit_for_ocr()`` is not defined
# anywhere in this package -- confirm both are provided elsewhere.
class APIDocumentVersionOCRView(generics.GenericAPIView):
    # Object-level permissions required, keyed by HTTP method.
    mayan_object_permissions = {
        'POST': (permission_ocr_document,)
    }
    permission_classes = (MayanPermission,)
    queryset = DocumentVersion.objects.all()

    def get_serializer_class(self):
        # The view neither reads a request body nor returns one.
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document version for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """

        # get_object() resolves the document version from the URL arguments.
        self.get_object().submit_for_ocr()
        return Response(status=status.HTTP_202_ACCEPTED)
|
||||
|
||||
|
||||
class APIDocumentPageContentView(generics.RetrieveAPIView):
    """
    Returns the parsed content of the selected document page.
    ---
    GET:
        parameters:
            - name: pk
              paramType: path
              type: number
    """

    # NOTE(review): ``permission_ocr_content_view`` is imported at the top of
    # this module but this app's permissions module only defines
    # ``permission_content_view`` -- confirm which permission is intended.
    mayan_object_permissions = {
        'GET': (permission_ocr_content_view,),
    }
    permission_classes = (MayanPermission,)
    serializer_class = DocumentPageContentSerializer
    queryset = DocumentPage.objects.all()

    def retrieve(self, request, *args, **kwargs):
        """
        Serialize the content of the requested page.

        When the page has no stored content yet, an unsaved, empty
        ``DocumentPageContent`` is serialized so the response keeps the same
        shape (an empty ``content`` value).
        """
        instance = self.get_object()

        try:
            # ``content`` is the related_name of the one-to-one field that
            # DocumentPageContent declares on DocumentPage.
            page_content = instance.content
        except DocumentPageContent.DoesNotExist:
            # The serializer expects a single model instance, not a queryset.
            page_content = DocumentPageContent(document_page=instance)

        serializer = self.get_serializer(page_content)
        return Response(serializer.data)
|
||||
125
mayan/apps/document_parsing/apps.py
Normal file
125
mayan/apps/document_parsing/apps.py
Normal file
@@ -0,0 +1,125 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from kombu import Exchange, Queue
|
||||
|
||||
from django.apps import apps
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from acls import ModelPermission
|
||||
from common import (
|
||||
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
|
||||
menu_tools
|
||||
)
|
||||
from common.settings import settings_db_sync_task_delay
|
||||
from documents.search import document_search, document_page_search
|
||||
from documents.signals import post_version_upload
|
||||
from documents.widgets import document_link
|
||||
from mayan.celery import app
|
||||
from navigation import SourceColumn
|
||||
from rest_api.classes import APIEndPoint
|
||||
|
||||
from .handlers import handler_parse_document_version
|
||||
from .links import (
|
||||
link_document_content, link_entry_list, link_document_content_errors_list,
|
||||
link_document_content_download
|
||||
)
|
||||
from .permissions import permission_content_view
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentParsingApp(MayanAppConfig):
    """
    App configuration of the document parsing app.

    ``ready()`` registers the API endpoint, the object permission, the
    columns of the parse error list, the search fields, the menu links and
    the signal handler connected to ``post_version_upload``.
    """
    has_tests = True
    name = 'document_parsing'
    verbose_name = _('Document parsing')

    def ready(self):
        super(DocumentParsingApp, self).ready()

        APIEndPoint(app=self, version_string='1')

        # Models are fetched through the app registry instead of being
        # imported at module level.
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )

        DocumentVersion = apps.get_model(
            app_label='documents', model_name='DocumentVersion'
        )

        DocumentVersionParseError = self.get_model('DocumentVersionParseError')

        ModelPermission.register(
            model=Document, permissions=(permission_content_view,)
        )

        SourceColumn(
            source=DocumentVersionParseError, label=_('Document'),
            func=lambda context: document_link(
                context['object'].document_version.document
            )
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Added'),
            attribute='datetime_submitted'
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Result'),
            attribute='result'
        )

        document_search.add_model_field(
            field='versions__pages__content__content', label=_('Content')
        )

        document_page_search.add_model_field(
            field='content__content', label=_('Content')
        )

        menu_facet.bind_links(
            links=(link_document_content,), sources=(Document,)
        )
        # The bindings for the "submit", "submit multiple" and "document type
        # settings" links were dropped: this app's links module does not
        # define those links, so referencing them raised NameError here.
        menu_secondary.bind_links(
            links=(
                link_document_content, link_document_content_errors_list,
                link_document_content_download
            ),
            # Source names follow the view names used by the links above.
            sources=(
                'document_parsing:document_content',
                'document_parsing:document_page_parsing_error_list',
                'document_parsing:document_content_download',
            )
        )
        menu_secondary.bind_links(
            links=(link_entry_list,),
            sources=(
                'document_parsing:entry_list',
                'document_parsing:entry_delete_multiple',
                'document_parsing:entry_re_queue_multiple',
                DocumentVersionParseError
            )
        )
        # Trailing comma required: ``(link_entry_list)`` is not a tuple.
        menu_tools.bind_links(
            links=(link_entry_list,)
        )

        post_version_upload.connect(
            dispatch_uid='document_parsing_handler_parse_document_version',
            receiver=handler_parse_document_version,
            sender=DocumentVersion
        )
|
||||
22
mayan/apps/document_parsing/exceptions.py
Normal file
22
mayan/apps/document_parsing/exceptions.py
Normal file
@@ -0,0 +1,22 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
|
||||
class OCRError(Exception):
    """Error raised by the OCR backend."""
|
||||
|
||||
|
||||
class ParserError(Exception):
    """Common base class of the exceptions raised by file parsers."""
|
||||
|
||||
|
||||
class NoMIMETypeMatch(ParserError):
    """Raised when no parser is registered for the specified MIME type."""
|
||||
104
mayan/apps/document_parsing/forms.py
Normal file
104
mayan/apps/document_parsing/forms.py
Normal file
@@ -0,0 +1,104 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django import forms
|
||||
from django.utils.encoding import force_text
|
||||
from django.utils.html import conditional_escape
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.utils.translation import ugettext_lazy as _, ugettext
|
||||
|
||||
from common.widgets import TextAreaDiv
|
||||
from documents.models import DocumentType
|
||||
|
||||
from .models import DocumentPageContent, DocumentPageOCRContent
|
||||
|
||||
|
||||
class DocumentContentForm(forms.Form):
    """
    Form that concatenates the text content of all of a document's pages
    into a single textarea widget.

    Accepts the document through the ``instance`` keyword argument. Pages
    without stored content are skipped.
    """
    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )

    def __init__(self, *args, **kwargs):
        self.document = kwargs.pop('instance', None)
        super(DocumentContentForm, self).__init__(*args, **kwargs)
        content = []

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document (or an object without pages) was supplied.
            document_pages = []

        for page in document_pages:
            try:
                # ``content`` is the related_name DocumentPageContent
                # declares on the page; it raises the DoesNotExist caught
                # below when the page has no stored content.
                page_content = page.content.content
            except DocumentPageContent.DoesNotExist:
                continue

            # Page text is escaped because the joined result is marked safe.
            content.append(conditional_escape(force_text(page_content)))
            content.append(
                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                    ugettext(
                        'Page %(page_number)d'
                    ) % {'page_number': page.page_number}
                )
            )

        self.fields['contents'].initial = mark_safe(''.join(content))
|
||||
|
||||
|
||||
class DocumentOCRContentForm(forms.Form):
    """
    Form that concatenates the OCR text content of all of a document's
    pages into a single textarea widget.

    Accepts the document through the ``instance`` keyword argument. Pages
    without OCR content are skipped.
    """
    # NOTE(review): ``DocumentPageOCRContent`` is imported from this app's
    # models module, which does not define it in this commit -- confirm
    # where that model is meant to come from.
    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )

    def __init__(self, *args, **kwargs):
        self.document = kwargs.pop('instance', None)
        # super() must name this class; naming DocumentContentForm raised
        # TypeError because this form is not a subclass of it.
        super(DocumentOCRContentForm, self).__init__(*args, **kwargs)
        content = []

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document (or an object without pages) was supplied.
            document_pages = []

        for page in document_pages:
            try:
                page_content = page.ocr_content.content
            except DocumentPageOCRContent.DoesNotExist:
                continue

            # Page text is escaped because the joined result is marked safe.
            content.append(conditional_escape(force_text(page_content)))
            content.append(
                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                    ugettext(
                        'Page %(page_number)d'
                    ) % {'page_number': page.page_number}
                )
            )

        self.fields['contents'].initial = mark_safe(''.join(content))
|
||||
|
||||
|
||||
class DocumentTypeSelectForm(forms.Form):
    """Form with a single field to choose one of the document types."""
    # The label is wrapped in ``_`` so it is translated like the other
    # labels of this module; the bare parentheses left it a plain string.
    document_type = forms.ModelChoiceField(
        queryset=DocumentType.objects.all(), label=_('Document type')
    )
|
||||
15
mayan/apps/document_parsing/handlers.py
Normal file
15
mayan/apps/document_parsing/handlers.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.apps import apps
|
||||
|
||||
from .settings import setting_auto_ocr
|
||||
from .parsers import Parser
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def handler_parse_document_version(sender, instance, **kwargs):
    """
    Signal receiver that parses a document version.

    The version is handed to ``Parser.parse_document_version`` only when the
    signal's ``created`` argument is true.
    """
    if not kwargs['created']:
        return

    Parser.parse_document_version(document_version=instance)
|
||||
27
mayan/apps/document_parsing/links.py
Normal file
27
mayan/apps/document_parsing/links.py
Normal file
@@ -0,0 +1,27 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from navigation import Link
|
||||
|
||||
from .permissions import permission_content_view
|
||||
|
||||
# Navigation links of the document parsing app.
#
# Every link is guarded by ``permission_content_view``: it is the only
# permission this module imports. The previously used
# ``permission_ocr_document`` and ``permission_ocr_content_view`` were not
# imported here, so the module raised NameError on import.
link_document_content = Link(
    args='resolved_object.id', icon='fa fa-font',
    permissions=(permission_content_view,), text=_('Content'),
    view='document_parsing:document_content',
)
link_entry_list = Link(
    icon='fa fa-file-text-o', permissions=(permission_content_view,),
    text=_('Parsing errors'), view='document_parsing:entry_list'
)
link_document_content_errors_list = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Parsing errors'),
    view='document_parsing:document_page_parsing_error_list'
)
link_document_content_download = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Download content'),
    view='document_parsing:document_content_download'
)
|
||||
14
mayan/apps/document_parsing/managers.py
Normal file
14
mayan/apps/document_parsing/managers.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
|
||||
from django.apps import apps
|
||||
from django.db import models
|
||||
from django.utils.timezone import now
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentPageContentManager(models.Manager):
    """Manager of the document page content model; adds no methods yet."""
|
||||
47
mayan/apps/document_parsing/models.py
Normal file
47
mayan/apps/document_parsing/models.py
Normal file
@@ -0,0 +1,47 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import models
|
||||
from django.utils.encoding import force_text, python_2_unicode_compatible
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
||||
|
||||
from .managers import DocumentPageContentManager
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class DocumentPageContent(models.Model):
    """Text content stored for a single document page (one row per page)."""
    document_page = models.OneToOneField(
        DocumentPage,
        on_delete=models.CASCADE,
        related_name='content',
        verbose_name=_('Document page')
    )
    content = models.TextField(
        blank=True,
        verbose_name=_('Content')
    )

    objects = DocumentPageContentManager()

    class Meta:
        verbose_name = _('Document page content')
        verbose_name_plural = _('Document pages contents')

    def __str__(self):
        # Represented by the text form of the page it belongs to.
        page = self.document_page
        return force_text(page)
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class DocumentVersionParseError(models.Model):
    """Record of an error for a document version, ordered by submit time."""
    document_version = models.ForeignKey(
        DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
        verbose_name=_('Document version')
    )
    # ``auto_now_add`` is the Django field option that stamps the row on
    # creation; the previous ``auto_add_now`` is not a valid option and made
    # the field constructor raise TypeError.
    datetime_submitted = models.DateTimeField(
        auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
    )
    result = models.TextField(blank=True, null=True, verbose_name=_('Result'))

    def __str__(self):
        return force_text(self.document_version)

    class Meta:
        ordering = ('datetime_submitted',)
        verbose_name = _('Document version parse error')
        verbose_name_plural = _('Document version parse errors')
|
||||
202
mayan/apps/document_parsing/parsers.py
Normal file
202
mayan/apps/document_parsing/parsers.py
Normal file
@@ -0,0 +1,202 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import os
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
import subprocess
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import copyfile, fs_cleanup, mkstemp
|
||||
|
||||
from .exceptions import ParserError, NoMIMETypeMatch
|
||||
from .models import DocumentPageContent
|
||||
from .settings import setting_pdftotext_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Parser(object):
    """
    Base class of the content parsers.

    Keeps a class-level registry that maps a MIME type to the ordered list
    of parser classes able to handle it. Subclasses implement ``execute()``.
    """

    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """Append every parser class to the list of every MIME type."""
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                if mimetype not in cls._registry:
                    cls._registry[mimetype] = []

                cls._registry[mimetype].append(parser_class)

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse a document version with the first parser that succeeds.

        Raises NoMIMETypeMatch when the MIME type has no registered parser
        or when every registered parser raised ParserError.
        """
        try:
            for candidate in cls._registry[document_version.mimetype]:
                try:
                    candidate().process_document_version(document_version)
                except ParserError:
                    # This parser failed, move on to the next one
                    continue

                # First success wins, the remaining parsers are not tried
                return

            raise NoMIMETypeMatch('Parser MIME type list exhausted')
        except KeyError:
            raise NoMIMETypeMatch

    @classmethod
    def parse_document_page(cls, document_page):
        """
        Parse a single page with the first parser that succeeds.

        Raises NoMIMETypeMatch when the MIME type of the page's document
        version has no registered parser or when every registered parser
        raised ParserError.
        """
        try:
            for candidate in cls._registry[document_page.document_version.mimetype]:
                try:
                    candidate().process_document_page(document_page)
                except ParserError:
                    # This parser failed, move on to the next one
                    continue

                # First success wins, the remaining parsers are not tried
                return

            raise NoMIMETypeMatch('Parser MIME type list exhausted')
        except KeyError:
            raise NoMIMETypeMatch

    def process_document_version(self, document_version):
        """Process every page of the document version, one by one."""
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        pages = document_version.pages.all()
        for page in pages:
            self.process_document_page(document_page=page)

    def process_document_page(self, document_page):
        """
        Run ``execute()`` for the page and store the result as its content.

        Any exception raised while doing so is logged and re-raised as
        ParserError. The intermediate file is closed in every case.
        """
        version = document_page.document_version
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            page_content, _created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """Return the content of one page; must be provided by subclasses."""
        class_name = self.__class__.__name__
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            class_name
        )
|
||||
|
||||
|
||||
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext executable from the poppler package
    """

    def __init__(self):
        """
        Read the configured pdftotext path.

        Raises ParserError when no file exists at that path.
        """
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Extract the text of one page by running pdftotext on a temporary
        copy of the file.

        Returns the program's output with the trailing page separator
        removed, or an empty string when the page produced no text.
        Raises ParserError when pdftotext exits with a non-zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()

        try:
            copyfile(file_object, temp_filepath)

            # -f / -l select the first and last page; '-' sends the text to
            # standard output.
            command = [
                self.pdftotext_path, '-f', str(page_number), '-l',
                str(page_number), temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. Calling wait()
            # first can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # Single cleanup point, also reached when copying or spawning
            # the process fails.
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        # A lone form feed means the page has no text.
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        # Strip the two newlines and form feed that end the page.
        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
|
||||
|
||||
|
||||
class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """

    def execute(self, file_object, page_number):
        """
        Extract the text of one page with PDFMiner and return the bytes
        written to the in-memory output buffer.

        ``page_number`` is 1-based; PDFMiner's ``pagenos`` is 0-based.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, outfp=string_buffer, laparams=LAParams()
            )
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                page = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # The next() builtin works on Python 2 and 3 alike, unlike
                # the Python 2 only ``.next()`` iterator method.
                interpreter.process_page(next(page))
            finally:
                # Close the device even when the page cannot be processed.
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            return string_buffer.getvalue()
|
||||
|
||||
|
||||
# Register both PDF parsers. They are tried in the order listed, so
# PDFMinerParser is only used when PopplerParser raises ParserError.
Parser.register(
    mimetypes=('application/pdf',),
    parser_classes=(PopplerParser, PDFMinerParser)
)
|
||||
11
mayan/apps/document_parsing/permissions.py
Normal file
11
mayan/apps/document_parsing/permissions.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from permissions import PermissionNamespace
|
||||
|
||||
# Permission namespace of the document parsing app.
namespace = PermissionNamespace('document_parsing', _('Document parsing'))

# Permission to view the content of a document.
permission_content_view = namespace.add_permission(
    name='content_view', label=_('View the content of a document')
)
|
||||
10
mayan/apps/document_parsing/queues.py
Normal file
10
mayan/apps/document_parsing/queues.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from task_manager.classes import CeleryQueue
|
||||
|
||||
# Task queue declaration.
# NOTE(review): the queue name 'ocr' and the task path
# 'ocr.tasks.task_do_ocr' point at the OCR app rather than at this package,
# and no task module is visible here -- confirm this queue is intended.
queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
queue_ocr.add_task_type(
    name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
)
|
||||
11
mayan/apps/document_parsing/serializers.py
Normal file
11
mayan/apps/document_parsing/serializers.py
Normal file
@@ -0,0 +1,11 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from rest_framework import serializers
|
||||
|
||||
from .models import DocumentPageContent
|
||||
|
||||
|
||||
class DocumentPageContentSerializer(serializers.ModelSerializer):
    """Serializer exposing only the ``content`` field of a page content."""

    class Meta:
        model = DocumentPageContent
        fields = ('content',)
|
||||
17
mayan/apps/document_parsing/settings.py
Normal file
17
mayan/apps/document_parsing/settings.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from smart_settings import Namespace
|
||||
|
||||
# Settings namespace of the document parsing app.
namespace = Namespace(name='document_parsing', label=_('Document parsing'))

# Path of the pdftotext executable; defaults to /usr/bin/pdftotext.
setting_pdftotext_path = namespace.add_setting(
    global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH',
    default='/usr/bin/pdftotext',
    help_text=_(
        'File path to poppler\'s pdftotext program used to extract text '
        'from PDF files.'
    ),
    is_path=True
)
|
||||
0
mayan/apps/document_parsing/tests/__init__.py
Normal file
0
mayan/apps/document_parsing/tests/__init__.py
Normal file
88
mayan/apps/document_parsing/tests/test_api.py
Normal file
88
mayan/apps/document_parsing/tests/test_api.py
Normal file
@@ -0,0 +1,88 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import json
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.urls import reverse
|
||||
|
||||
from rest_framework import status
|
||||
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||
from rest_api.tests import BaseAPITestCase
|
||||
from user_management.tests import (
|
||||
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
|
||||
)
|
||||
|
||||
|
||||
class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints
    """

    def setUp(self):
        """Create and log in an admin user, then upload the test document."""
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Binary mode: the fixture is an uploaded document file, not text.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the check fails.
        self.assertIn('Mayan EDMS Documentation', content)

    def test_submit_document_version(self):
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertIn('Mayan EDMS Documentation', content)

    def test_get_document_version_page_content(self):
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)

        self.assertIn(
            'Mayan EDMS Documentation',
            json.loads(response.content)['content']
        )
|
||||
41
mayan/apps/document_parsing/tests/test_events.py
Normal file
41
mayan/apps/document_parsing/tests/test_events.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from actstream.models import Action
|
||||
|
||||
from documents.tests.test_models import GenericDocumentTestCase
|
||||
|
||||
from ..events import (
|
||||
event_ocr_document_version_submit, event_ocr_document_version_finish
|
||||
)
|
||||
|
||||
|
||||
class OCREventsTestCase(GenericDocumentTestCase):
    """Check the events recorded when a document is submitted for OCR."""

    def test_document_version_submit_event(self):
        # Start from an empty action log so first() is the submit event.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.first().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.first().verb,
            event_ocr_document_version_submit.name
        )

    def test_document_version_finish_event(self):
        # Start from an empty action log so last() is the finish event.
        # The leftover debugging prints were removed, together with the
        # in-test import they needed, which named a model this app does not
        # define.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.last().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.last().verb,
            event_ocr_document_version_finish.name
        )
|
||||
77
mayan/apps/document_parsing/tests/test_models.py
Normal file
77
mayan/apps/document_parsing/tests/test_models.py
Normal file
@@ -0,0 +1,77 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.settings import setting_language_choices
|
||||
from documents.tests import (
|
||||
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
|
||||
class DocumentOCRTestCase(BaseTestCase):
    """Check that an uploaded document ends up with the expected content."""
    # PyOCR's leak descriptor in get_available_languages and image_to_string
    # Disable descriptor leak test until fixed in upstream
    _skip_file_descriptor_test = True

    def setUp(self):
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Binary mode: the fixture is an uploaded document file, not text.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        content = self.document.pages.first().ocr_content.content
        # assertIn reports both operands when the check fails.
        self.assertIn('Mayan EDMS Documentation', content)
|
||||
|
||||
|
||||
class GermanOCRSupportTestCase(BaseTestCase):
    """Check the content of a document uploaded with German as language."""
    # PyOCR's leak descriptor in get_available_languages and image_to_string
    # Disable descriptor leak test until fixed in upstream
    _skip_file_descriptor_test = True

    def setUp(self):
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get corresponding language code for German from the default language
        # choices list
        language_code = [
            language for language in setting_language_choices.value if language[1] == 'German'
        ][0][0]

        self.assertEqual('deu', language_code)

        # Binary mode: the fixture is an uploaded document file, not text.
        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        content = self.document.pages.first().ocr_content.content

        # assertIn reports both operands when the check fails.
        self.assertIn(
            'Repository für elektronische Dokumente.', content
        )
        self.assertIn(
            'Es bietet einen', content
        )
|
||||
83
mayan/apps/document_parsing/tests/test_parsers.py
Normal file
83
mayan/apps/document_parsing/tests/test_parsers.py
Normal file
@@ -0,0 +1,83 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
from django.test import override_settings
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import (
|
||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
from ..classes import TextExtractor
|
||||
from ..parsers import PDFMinerParser, PopplerParser
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """
    Run each parser against the latest version of the test document and
    check the content stored for the first page.
    """

    def setUp(self):
        super(ParserTestCase, self).setUp()
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open in binary mode so the file's bytes reach new_document()
        # untouched; text mode may translate newlines or decode the data.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        # assertIn shows both operands when the check fails.
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """
    Run TextExtractor on the hybrid test document and check the content
    stored for its first and last pages.
    """

    def setUp(self):
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Open in binary mode so the file's bytes reach new_document()
        # untouched; text mode may translate newlines or decode the data.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )
|
||||
61
mayan/apps/document_parsing/tests/test_views.py
Normal file
61
mayan/apps/document_parsing/tests/test_views.py
Normal file
@@ -0,0 +1,61 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
from documents.tests.test_views import GenericDocumentViewTestCase
|
||||
|
||||
from ..permissions import permission_ocr_content_view
|
||||
from ..utils import get_document_ocr_content
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
    """
    Exercise the OCR content and OCR download views with and without the
    OCR content view permission.
    """
    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string. Disable the descriptor leak test until this is fixed
    # upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(OCRViewsTestCase, self).setUp()
        self.login_user()

    def _document_content_view(self):
        # Shared request helper for the two content view tests.
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_content', args=view_arguments)

    def test_document_content_view_no_permissions(self):
        response = self._document_content_view()
        self.assertEqual(response.status_code, 403)

    def test_document_content_view_with_permission(self):
        self.grant_permission(permission=permission_ocr_content_view)
        response = self._document_content_view()
        self.assertContains(
            response, 'Mayan EDMS Documentation', status_code=200
        )

    def test_document_ocr_download_view_no_permission(self):
        view_arguments = (self.document.pk,)
        response = self.get('ocr:document_ocr_download', args=view_arguments)
        self.assertEqual(response.status_code, 403)

    def test_document_download_view_with_permission(self):
        self.expected_content_type = 'application/octet-stream; charset=utf-8'
        self.grant_permission(permission=permission_ocr_content_view)

        view_arguments = (self.document.pk,)
        response = self.get('ocr:document_ocr_download', args=view_arguments)
        self.assertEqual(response.status_code, 200)

        # The download is compared against the concatenated per-page
        # content produced by get_document_ocr_content.
        expected_content = ''.join(
            get_document_ocr_content(document=self.document)
        )
        self.assert_download_response(response, content=expected_content)
|
||||
65
mayan/apps/document_parsing/urls.py
Normal file
65
mayan/apps/document_parsing/urls.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.conf.urls import url
|
||||
|
||||
from .api_views import (
|
||||
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
|
||||
)
|
||||
from .views import (
|
||||
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
||||
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
|
||||
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
|
||||
)
|
||||
|
||||
# User interface URL patterns of the app.
urlpatterns = [
    url(
        regex=r'^(?P<pk>\d+)/content/$', view=DocumentOCRContent.as_view(),
        name='document_content'
    ),
    url(
        regex=r'^document/(?P<pk>\d+)/submit/$',
        view=DocumentSubmitView.as_view(), name='document_submit'
    ),
    url(
        regex=r'^document/all/submit/$',
        view=DocumentAllSubmitView.as_view(), name='document_submit_all'
    ),
    url(
        regex=r'^document/type/submit/$',
        view=DocumentTypeSubmitView.as_view(), name='document_type_submit'
    ),
    url(
        regex=r'^document/multiple/submit/$',
        view=DocumentSubmitManyView.as_view(),
        name='document_submit_multiple'
    ),
    url(
        regex=r'^document_type/(?P<pk>\d+)/ocr/settings/$',
        view=DocumentTypeSettingsEditView.as_view(),
        name='document_type_ocr_settings'
    ),
    url(
        regex=r'^documents/(?P<pk>\d+)/ocr/errors/$',
        view=DocumentOCRErrorsListView.as_view(),
        name='document_ocr_error_list'
    ),
    url(
        regex=r'^documents/(?P<pk>\d+)/ocr/download/$',
        view=DocumentOCRDownloadView.as_view(),
        name='document_ocr_download'
    ),
    url(regex=r'^all/$', view=EntryListView.as_view(), name='entry_list'),
]
|
||||
|
||||
# REST API URL patterns of the app.
api_urls = [
    url(
        regex=r'^document/(?P<pk>\d+)/submit/$',
        view=APIDocumentOCRView.as_view(), name='document-ocr-submit-view'
    ),
    url(
        regex=r'^document_version/(?P<pk>\d+)/submit/$',
        view=APIDocumentVersionOCRView.as_view(),
        name='document-version-ocr-submit-view'
    ),
    url(
        regex=r'^page/(?P<pk>\d+)/content/$',
        view=APIDocumentPageContentView.as_view(),
        name='document-page-content-view'
    ),
]
|
||||
16
mayan/apps/document_parsing/utils.py
Normal file
16
mayan/apps/document_parsing/utils.py
Normal file
@@ -0,0 +1,16 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.encoding import force_text
|
||||
from django.utils.html import conditional_escape
|
||||
|
||||
from .models import DocumentPageContent
|
||||
|
||||
|
||||
def get_document_ocr_content(document):
    """
    Yield the OCR content of each page of `document`, converted to text
    and conditionally HTML escaped. Pages without a DocumentPageContent
    entry are skipped.
    """
    for document_page in document.pages.all():
        try:
            text = document_page.ocr_content.content
        except DocumentPageContent.DoesNotExist:
            # No content stored for this page; move on to the next one.
            continue

        yield conditional_escape(force_text(text))
|
||||
190
mayan/apps/document_parsing/views.py
Normal file
190
mayan/apps/document_parsing/views.py
Normal file
@@ -0,0 +1,190 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.contrib import messages
|
||||
from django.http import HttpResponseRedirect
|
||||
from django.shortcuts import get_object_or_404
|
||||
from django.urls import reverse
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from acls.models import AccessControlList
|
||||
from common.generics import (
|
||||
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
|
||||
SingleObjectEditView, SingleObjectListView
|
||||
)
|
||||
from common.mixins import MultipleInstanceActionMixin
|
||||
from documents.models import Document, DocumentType
|
||||
|
||||
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import (
|
||||
permission_ocr_content_view, permission_ocr_document,
|
||||
permission_document_type_ocr_setup
|
||||
)
|
||||
from .utils import get_document_ocr_content
|
||||
|
||||
|
||||
class DocumentAllSubmitView(ConfirmView):
    """
    Confirmation view that submits every document for OCR and reports how
    many were submitted.
    """
    # NOTE(review): no permission attribute is declared on this view;
    # confirm that access to it is restricted elsewhere.
    extra_context = {'title': _('Submit all documents for OCR?')}

    def get_post_action_redirect(self):
        return reverse('common:tools_list')

    def view_action(self):
        # enumerate() leaves the number of submitted documents in
        # `submitted`; it stays at 0 when there are no documents.
        submitted = 0
        for submitted, document in enumerate(Document.objects.all(), 1):
            document.submit_for_ocr()

        messages.success(
            self.request,
            _('%d documents added to the OCR queue.') % submitted
        )
|
||||
|
||||
|
||||
class DocumentSubmitView(ConfirmView):
    """
    Confirmation view that submits a single document, selected by the
    `pk` URL argument, to the OCR queue.
    """

    def get_extra_context(self):
        # Fetch the document once and reuse it for both context entries.
        document = self.get_object()

        return {
            'object': document,
            'title': _('Submit "%s" to the OCR queue?') % document
        }

    def get_object(self):
        # Respond with a 404 for an unknown pk, as the other views of this
        # module do, instead of letting Document.DoesNotExist propagate.
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def object_action(self, instance):
        # Check the OCR permission for the requesting user against this
        # document before submitting it.
        AccessControlList.objects.check_access(
            permissions=permission_ocr_document, user=self.request.user,
            obj=instance
        )

        instance.submit_for_ocr()

    def view_action(self):
        instance = self.get_object()

        self.object_action(instance=instance)

        messages.success(
            self.request,
            _('Document: %(document)s was added to the OCR queue.') % {
                'document': instance
            }
        )
|
||||
|
||||
|
||||
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
    """
    Variant of DocumentSubmitView combined with
    MultipleInstanceActionMixin for submitting a selection of documents.
    """
    model = Document
    success_message = '%(count)d document submitted to the OCR queue.'
    success_message_plural = '%(count)d documents submitted to the OCR queue.'

    def get_extra_context(self):
        # Replaces the parent's context, which is built around a single
        # document.
        title = _('Submit the selected documents to the OCR queue?')
        return {'title': title}
|
||||
|
||||
|
||||
class DocumentTypeSubmitView(FormView):
    """
    Form view that submits all the documents of the selected document
    type for OCR.
    """
    extra_context = {
        'title': _('Submit all documents of a type for OCR')
    }
    form_class = DocumentTypeSelectForm

    def get_post_action_redirect(self):
        return reverse('common:tools_list')

    def form_valid(self, form):
        document_type = form.cleaned_data['document_type']

        # enumerate() leaves the number of submitted documents in
        # `submitted`; it stays at 0 when the type has no documents.
        submitted = 0
        for submitted, document in enumerate(document_type.documents.all(), 1):
            document.submit_for_ocr()

        message = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'OCR queue.'
        ) % {
            'count': submitted,
            'document_type': document_type
        }
        messages.success(self.request, message)

        return HttpResponseRedirect(self.get_success_url())
|
||||
|
||||
|
||||
class DocumentTypeSettingsEditView(SingleObjectEditView):
    """
    Edit view for the `auto_ocr` field of a document type's OCR settings.
    """
    view_permission = permission_document_type_ocr_setup
    fields = ('auto_ocr',)

    def get_object(self, queryset=None):
        # The edited object is the settings instance attached to the
        # document type, not the document type itself.
        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
        return document_type.ocr_settings

    def get_extra_context(self):
        ocr_settings = self.get_object()
        title = _(
            'Edit OCR settings for document type: %s'
        ) % ocr_settings.document_type

        return {'title': title}
|
||||
|
||||
|
||||
class DocumentOCRContent(SingleObjectDetailView):
    """
    Detail view that shows a document through DocumentContentForm and
    records the document as recently accessed by the requesting user.
    """
    form_class = DocumentContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        # Run the normal dispatch first; the document is only recorded as
        # recent when that call returns a response.
        result = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )
        self.get_object().add_as_recent_document_for_user(request.user)
        return result

    def get_extra_context(self):
        # Fetch the document once instead of once per context entry.
        document = self.get_object()

        return {
            'document': document,
            'hide_labels': True,
            'object': document,
            'title': _('OCR result for document: %s') % document,
        }
|
||||
|
||||
|
||||
class EntryListView(SingleObjectListView):
    """List view of all the DocumentVersionOCRError entries."""
    view_permission = permission_ocr_document
    extra_context = {
        'hide_object': True,
        'title': _('OCR errors'),
    }

    def get_object_list(self):
        error_entries = DocumentVersionOCRError.objects.all()
        return error_entries
|
||||
|
||||
|
||||
class DocumentOCRErrorsListView(SingleObjectListView):
    """
    List view of the OCR errors of the latest version of the document
    selected by the `pk` URL argument.
    """
    view_permission = permission_ocr_document

    def get_document(self):
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def get_extra_context(self):
        # Fetch the document once and reuse it for both context entries.
        document = self.get_document()

        return {
            'hide_object': True,
            'object': document,
            'title': _('OCR errors for document: %s') % document,
        }

    def get_object_list(self):
        return self.get_document().latest_version.ocr_errors.all()
|
||||
|
||||
|
||||
class DocumentOCRDownloadView(SingleObjectDownloadView):
    """
    Download view that serves a document's OCR content as a file named
    after the document with an '-OCR' suffix.
    """
    model = Document
    object_permission = permission_ocr_content_view

    def get_file(self):
        # Fetch the document once; it is needed for both the content
        # iterator and the file name.
        document = self.get_object()

        file_object = DocumentOCRDownloadView.TextIteratorIO(
            iterator=get_document_ocr_content(document=document)
        )
        return DocumentOCRDownloadView.VirtualFile(
            file=file_object, name='{}-OCR'.format(document)
        )
|
||||
@@ -84,6 +84,7 @@ INSTALLED_APPS = (
|
||||
'checkouts',
|
||||
'document_comments',
|
||||
'document_indexing',
|
||||
'document_parsing',
|
||||
'document_signatures',
|
||||
'document_states',
|
||||
'documents',
|
||||
|
||||
Reference in New Issue
Block a user