Initial commit of the document parsing app.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-08-23 02:23:14 -04:00
parent 317d07a355
commit e9591c92f9
25 changed files with 1350 additions and 0 deletions

View File

@@ -0,0 +1,3 @@
from __future__ import unicode_literals
# Dotted path of the AppConfig class Django loads for this app.
default_app_config = 'document_parsing.apps.DocumentParsingApp'

View File

@@ -0,0 +1,23 @@
from __future__ import unicode_literals

from django.contrib import admin

# Only the models this app actually defines are imported. The previous
# DocumentTypeSettings and DocumentVersionOCRError names do not exist in
# this app's models module and made the admin module fail at import time.
from .models import DocumentPageContent, DocumentVersionParseError


@admin.register(DocumentPageContent)
class DocumentPageContentAdmin(admin.ModelAdmin):
    """Admin listing of the parsed content stored per document page."""
    list_display = ('document_page',)


@admin.register(DocumentVersionParseError)
class DocumentVersionParseErrorAdmin(admin.ModelAdmin):
    """Read-only admin view of the parse errors logged per document version."""
    list_display = ('document_version', 'datetime_submitted')
    # Error entries are records of what happened; none of them is editable.
    readonly_fields = ('document_version', 'datetime_submitted', 'result')

View File

@@ -0,0 +1,97 @@
from __future__ import absolute_import, unicode_literals
from rest_framework import generics, status
from rest_framework.response import Response
from documents.models import Document, DocumentPage, DocumentVersion
from rest_api.permissions import MayanPermission
from .models import DocumentPageContent
from .permissions import permission_ocr_content_view, permission_ocr_document
from .serializers import DocumentPageContentSerializer
# API endpoint that submits a single document for OCR via POST.
# NOTE(review): this lives in the document parsing app but still calls the
# OCR submit method and uses OCR permissions -- confirm this is intended.
class APIDocumentOCRView(generics.GenericAPIView):
    queryset = Document.objects.all()
    permission_classes = (MayanPermission,)
    # Object level permission required for each HTTP method.
    mayan_object_permissions = {'POST': (permission_ocr_document,)}

    def get_serializer_class(self):
        # The endpoint declares no serializer (see omit_serializer below).
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """
        document = self.get_object()
        document.submit_for_ocr()
        return Response(status=status.HTTP_202_ACCEPTED)
# API endpoint that submits a single document version for OCR via POST.
# NOTE(review): this lives in the document parsing app but still calls the
# OCR submit method and uses OCR permissions -- confirm this is intended.
class APIDocumentVersionOCRView(generics.GenericAPIView):
    queryset = DocumentVersion.objects.all()
    permission_classes = (MayanPermission,)
    # Object level permission required for each HTTP method.
    mayan_object_permissions = {'POST': (permission_ocr_document,)}

    def get_serializer_class(self):
        # The endpoint declares no serializer (see omit_serializer below).
        return None

    def post(self, request, *args, **kwargs):
        """
        Submit a document version for OCR.
        ---
        omit_serializer: true
        parameters:
            - name: pk
              paramType: path
              type: number
        responseMessages:
            - code: 202
              message: Accepted
        """
        document_version = self.get_object()
        document_version.submit_for_ocr()
        return Response(status=status.HTTP_202_ACCEPTED)
class APIDocumentPageContentView(generics.RetrieveAPIView):
    """
    Returns the OCR content of the selected document page.
    ---
    GET:
        parameters:
            - name: pk
              paramType: path
              type: number
    """
    # NOTE(review): permission_ocr_content_view is not defined in this app's
    # permissions module as shown -- confirm which permission should guard
    # this endpoint.
    mayan_object_permissions = {
        'GET': (permission_ocr_content_view,),
    }
    permission_classes = (MayanPermission,)
    serializer_class = DocumentPageContentSerializer
    queryset = DocumentPage.objects.all()

    def retrieve(self, request, *args, **kwargs):
        """
        Serialize the content attached to the requested page.

        Pages without a content row are answered with an empty, unsaved
        DocumentPageContent so the response keeps the same shape.
        """
        instance = self.get_object()

        try:
            # 'content' is the reverse accessor declared by
            # DocumentPageContent.document_page (related_name='content').
            page_content = instance.content
        except DocumentPageContent.DoesNotExist:
            # The serializer expects a single object; an empty queryset has
            # no 'content' attribute to serialize.
            page_content = DocumentPageContent(document_page=instance)

        serializer = self.get_serializer(page_content)
        return Response(serializer.data)

View File

@@ -0,0 +1,125 @@
from __future__ import unicode_literals
import logging
from kombu import Exchange, Queue
from django.apps import apps
from django.db.models.signals import post_save
from django.utils.translation import ugettext_lazy as _
from acls import ModelPermission
from common import (
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
menu_tools
)
from common.settings import settings_db_sync_task_delay
from documents.search import document_search, document_page_search
from documents.signals import post_version_upload
from documents.widgets import document_link
from mayan.celery import app
from navigation import SourceColumn
from rest_api.classes import APIEndPoint
from .handlers import handler_parse_document_version
from .links import (
link_document_content, link_entry_list, link_document_content_errors_list,
link_document_content_download
)
from .permissions import permission_content_view
logger = logging.getLogger(__name__)


class DocumentParsingApp(MayanAppConfig):
    """
    App configuration of the document parsing app.

    On ready() it registers the API end point, the content view permission,
    the parse error list columns, the search fields, the menu links and the
    signal handler that parses newly uploaded document versions.
    """
    has_tests = True
    name = 'document_parsing'
    verbose_name = _('Document parsing')

    def ready(self):
        super(DocumentParsingApp, self).ready()

        APIEndPoint(app=self, version_string='1')

        # Models are looked up lazily to avoid importing them before the
        # app registry is ready.
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )
        DocumentVersion = apps.get_model(
            app_label='documents', model_name='DocumentVersion'
        )
        DocumentVersionParseError = self.get_model('DocumentVersionParseError')

        ModelPermission.register(
            model=Document, permissions=(permission_content_view,)
        )

        # Columns of the parse error list.
        SourceColumn(
            source=DocumentVersionParseError, label=_('Document'),
            func=lambda context: document_link(
                context['object'].document_version.document
            )
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Added'),
            attribute='datetime_submitted'
        )
        SourceColumn(
            source=DocumentVersionParseError, label=_('Result'),
            attribute='result'
        )

        document_search.add_model_field(
            field='versions__pages__content__content', label=_('Content')
        )
        document_page_search.add_model_field(
            field='content__content', label=_('Content')
        )

        menu_facet.bind_links(
            links=(link_document_content,), sources=(Document,)
        )
        # Only links defined in this app's links module are bound. The
        # submit and document type settings links referenced previously are
        # not defined there and raised NameError as soon as ready() ran.
        menu_secondary.bind_links(
            links=(
                link_document_content, link_document_content_errors_list,
                link_document_content_download
            ),
            sources=(
                'document_parsing:document_content',
                'document_parsing:document_ocr_error_list',
                'document_parsing:document_ocr_download',
            )
        )
        menu_secondary.bind_links(
            links=(link_entry_list,),
            sources=(
                'document_parsing:entry_list',
                'document_parsing:entry_delete_multiple',
                'document_parsing:entry_re_queue_multiple',
                DocumentVersionParseError
            )
        )
        # The trailing comma matters: without it 'links' is a bare Link
        # instance instead of a one element tuple.
        menu_tools.bind_links(
            links=(link_entry_list,)
        )

        post_version_upload.connect(
            dispatch_uid='document_parsing_handler_parse_document_version',
            receiver=handler_parse_document_version,
            sender=DocumentVersion
        )

View File

@@ -0,0 +1,22 @@
from __future__ import unicode_literals
class OCRError(Exception):
    """Error signalled by the OCR backend."""
class ParserError(Exception):
    """Root of the exceptions raised by file parsers."""
class NoMIMETypeMatch(ParserError):
    """No parser is registered for the requested MIME type."""

View File

@@ -0,0 +1,104 @@
from __future__ import unicode_literals
from django import forms
from django.utils.encoding import force_text
from django.utils.html import conditional_escape
from django.utils.safestring import mark_safe
from django.utils.translation import ugettext_lazy as _, ugettext
from common.widgets import TextAreaDiv
from documents.models import DocumentType
from .models import DocumentPageContent, DocumentPageOCRContent
class DocumentContentForm(forms.Form):
    """
    Form that concatenates all of a document pages' text content into a
    single textarea widget
    """
    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )

    def __init__(self, *args, **kwargs):
        # The document arrives through the 'instance' keyword argument.
        self.document = kwargs.pop('instance', None)
        super(DocumentContentForm, self).__init__(*args, **kwargs)

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document (or an object without pages): render empty.
            document_pages = []

        content = []
        for page in document_pages:
            try:
                # 'content' is the reverse accessor declared by
                # DocumentPageContent.document_page; it is the accessor
                # that raises the DoesNotExist caught below.
                page_content = page.content.content
            except DocumentPageContent.DoesNotExist:
                continue

            # Page text is escaped; only the divider markup is trusted.
            content.append(conditional_escape(force_text(page_content)))
            content.append(
                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                    ugettext(
                        'Page %(page_number)d'
                    ) % {'page_number': page.page_number}
                )
            )

        self.fields['contents'].initial = mark_safe(''.join(content))
class DocumentOCRContentForm(forms.Form):
    """
    Form that concatenates all of a document pages' OCR text content into a
    single textarea widget
    """
    # NOTE(review): DocumentPageOCRContent is imported from this app's models
    # module, which does not define it as shown -- confirm where it should
    # come from.
    contents = forms.CharField(
        label=_('Contents'),
        widget=TextAreaDiv(
            attrs={
                'class': 'text_area_div full-height',
                'data-height-difference': 360
            }
        )
    )

    def __init__(self, *args, **kwargs):
        # The document arrives through the 'instance' keyword argument.
        self.document = kwargs.pop('instance', None)
        # super() must name this class; naming DocumentContentForm raised
        # TypeError because this form does not inherit from it.
        super(DocumentOCRContentForm, self).__init__(*args, **kwargs)

        try:
            document_pages = self.document.pages.all()
        except AttributeError:
            # No document (or an object without pages): render empty.
            document_pages = []

        content = []
        for page in document_pages:
            try:
                page_content = page.ocr_content.content
            except DocumentPageOCRContent.DoesNotExist:
                continue

            # Page text is escaped; only the divider markup is trusted.
            content.append(conditional_escape(force_text(page_content)))
            content.append(
                '\n\n\n<hr/><div class="document-page-content-divider">- %s -</div><hr/>\n\n\n' % (
                    ugettext(
                        'Page %(page_number)d'
                    ) % {'page_number': page.page_number}
                )
            )

        self.fields['contents'].initial = mark_safe(''.join(content))
class DocumentTypeSelectForm(forms.Form):
    """Form with a single field to choose one of the document types."""
    # The label goes through the lazy translation function; the bare
    # parentheses used before only grouped an untranslated string.
    document_type = forms.ModelChoiceField(
        queryset=DocumentType.objects.all(), label=_('Document type')
    )

View File

@@ -0,0 +1,15 @@
from __future__ import unicode_literals
import logging
from django.apps import apps
from .settings import setting_auto_ocr
from .parsers import Parser
logger = logging.getLogger(__name__)


def handler_parse_document_version(sender, instance, **kwargs):
    """
    Signal receiver that parses a document version.

    The version is parsed unless the signal explicitly reports
    created=False. Signals that do not send 'created' at all are treated
    as a new version instead of failing with KeyError.
    NOTE(review): the app connects this to post_version_upload, which
    presumably does not send 'created' -- confirm.
    """
    if kwargs.get('created', True):
        Parser.parse_document_version(document_version=instance)

View File

@@ -0,0 +1,27 @@
from __future__ import unicode_literals
from django.utils.translation import ugettext_lazy as _
from navigation import Link
from .permissions import permission_content_view
# All links use permission_content_view, the only permission this module
# imports. The OCR permission names used before were never imported and
# raised NameError when the module was loaded.
# NOTE(review): confirm whether the error list deserves its own permission.

# Link to the parsed content of a document.
link_document_content = Link(
    args='resolved_object.id', icon='fa fa-font',
    permissions=(permission_content_view,), text=_('Content'),
    view='document_parsing:document_content',
)
# Link to the list of all parsing errors.
link_entry_list = Link(
    icon='fa fa-file-text-o', permissions=(permission_content_view,),
    text=_('Parsing errors'), view='document_parsing:entry_list'
)
# Link to the parsing errors of a single document.
link_document_content_errors_list = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Parsing errors'),
    view='document_parsing:document_page_parsing_error_list'
)
# Link to download the parsed content of a document.
link_document_content_download = Link(
    args='resolved_object.id', icon='fa fa-file-text-o',
    permissions=(permission_content_view,), text=_('Download content'),
    view='document_parsing:document_content_download'
)

View File

@@ -0,0 +1,14 @@
from __future__ import unicode_literals
from datetime import timedelta
import logging
from django.apps import apps
from django.db import models
from django.utils.timezone import now
logger = logging.getLogger(__name__)


class DocumentPageContentManager(models.Manager):
    """Manager of DocumentPageContent; adds nothing to models.Manager yet."""

View File

@@ -0,0 +1,47 @@
from __future__ import unicode_literals
from django.db import models
from django.utils.encoding import force_text, python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _
from documents.models import DocumentPage, DocumentType, DocumentVersion
from .managers import DocumentPageContentManager
@python_2_unicode_compatible
class DocumentPageContent(models.Model):
    """
    Text content of a single document page.

    Each page has at most one content row, reachable from the page through
    the 'content' reverse accessor.
    """
    document_page = models.OneToOneField(
        DocumentPage,
        on_delete=models.CASCADE,
        related_name='content',
        verbose_name=_('Document page')
    )
    content = models.TextField(blank=True, verbose_name=_('Content'))

    objects = DocumentPageContentManager()

    class Meta:
        verbose_name = _('Document page content')
        verbose_name_plural = _('Document pages contents')

    def __str__(self):
        # Represented by the text form of the page it belongs to.
        page = self.document_page
        return force_text(page)
@python_2_unicode_compatible
class DocumentVersionParseError(models.Model):
    """
    Record of a failed parse of a document version.

    Entries are ordered by submission time and reachable from the version
    through the 'parse_errors' reverse accessor.
    """
    document_version = models.ForeignKey(
        DocumentVersion, on_delete=models.CASCADE, related_name='parse_errors',
        verbose_name=_('Document version')
    )
    # The Django option is spelled 'auto_now_add'; the previous
    # 'auto_add_now' is not a field option and raised TypeError when the
    # model class was built.
    datetime_submitted = models.DateTimeField(
        auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
    )
    result = models.TextField(blank=True, null=True, verbose_name=_('Result'))

    def __str__(self):
        return force_text(self.document_version)

    class Meta:
        ordering = ('datetime_submitted',)
        verbose_name = _('Document version parse error')
        verbose_name_plural = _('Document version parse errors')

View File

@@ -0,0 +1,202 @@
from __future__ import unicode_literals
from io import BytesIO
import logging
import os
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfpage import PDFPage
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
import subprocess
from django.utils.translation import ugettext_lazy as _
from common.utils import copyfile, fs_cleanup, mkstemp
from .exceptions import ParserError, NoMIMETypeMatch
from .models import DocumentPageContent
from .settings import setting_pdftotext_path
logger = logging.getLogger(__name__)


class Parser(object):
    """
    Parser base class.

    Keeps a class level registry that maps a MIME type to the ordered list
    of parser classes able to handle it. Subclasses implement execute().
    """
    _registry = {}

    @classmethod
    def register(cls, mimetypes, parser_classes):
        """
        Append every class of parser_classes, in order, to the list of
        parsers registered for each MIME type in mimetypes.
        """
        for mimetype in mimetypes:
            for parser_class in parser_classes:
                cls._registry.setdefault(mimetype, []).append(parser_class)

    @classmethod
    def _run_parsers(cls, mimetype, method_name, argument):
        """
        Call method_name(argument) on each parser registered for mimetype
        until one of them finishes without raising ParserError.

        Raises NoMIMETypeMatch when the MIME type has no registered parser
        or when every registered parser failed.
        """
        # Only the registry lookup is guarded. A KeyError raised from inside
        # a parser is a real error and must not be reported as a missing
        # MIME type.
        try:
            parser_classes = cls._registry[mimetype]
        except KeyError:
            raise NoMIMETypeMatch(
                'No parser registered for MIME type: %s' % mimetype
            )

        for parser_class in parser_classes:
            try:
                parser = parser_class()
                getattr(parser, method_name)(argument)
            except ParserError:
                # If parser raises error, try next parser in the list
                continue
            else:
                # If parser was successful there is no need to try
                # others in the list for this mimetype
                return

        raise NoMIMETypeMatch('Parser MIME type list exhausted')

    @classmethod
    def parse_document_version(cls, document_version):
        """
        Parse every page of document_version with the first registered
        parser that succeeds. Raises NoMIMETypeMatch if none does.
        """
        cls._run_parsers(
            document_version.mimetype, 'process_document_version',
            document_version
        )

    @classmethod
    def parse_document_page(cls, document_page):
        """
        Parse a single page with the first registered parser that succeeds.
        Raises NoMIMETypeMatch if none does.
        """
        cls._run_parsers(
            document_page.document_version.mimetype, 'process_document_page',
            document_page
        )

    def process_document_version(self, document_version):
        """Parse all the pages of document_version with this parser."""
        logger.info(
            'Starting parsing for document version: %s', document_version
        )
        logger.debug('document version: %d', document_version.pk)

        for document_page in document_version.pages.all():
            self.process_document_page(document_page=document_page)

    def process_document_page(self, document_page):
        """
        Extract the text of document_page with execute() and store it in
        the page's DocumentPageContent row, creating the row if needed.

        Any exception is logged and re-raised as ParserError so the caller
        can fall back to the next parser.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        file_object = document_page.document_version.get_intermidiate_file()

        try:
            document_page_content, created = DocumentPageContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = self.execute(
                file_object=file_object, page_number=document_page.page_number
            )
            document_page_content.save()
        except Exception as exception:
            error_message = _('Exception parsing page; %s') % exception
            logger.error(error_message)
            raise ParserError(error_message)
        finally:
            file_object.close()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

    def execute(self, file_object, page_number):
        """
        Return the text of page page_number of file_object. Must be
        implemented by each parser subclass.
        """
        raise NotImplementedError(
            'Your %s class has not defined the required execute() method.' %
            self.__class__.__name__
        )
class PopplerParser(Parser):
    """
    PDF parser using the pdftotext execute from the poppler package
    """
    def __init__(self):
        # Fail early, as ParserError, when the executable is missing so the
        # caller can fall back to another parser.
        self.pdftotext_path = setting_pdftotext_path.value
        if not os.path.exists(self.pdftotext_path):
            error_message = _(
                'Cannot find pdftotext executable at: %s'
            ) % self.pdftotext_path
            logger.error(error_message)
            raise ParserError(error_message)

        logger.debug('self.pdftotext_path: %s', self.pdftotext_path)

    def execute(self, file_object, page_number):
        """
        Run pdftotext on a temporary copy of file_object and return the
        text of page page_number as read from its standard output.

        Raises ParserError when pdftotext exits with a non zero code.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()

        # The temporary file is removed whatever happens from here on.
        try:
            copyfile(file_object, temp_filepath)

            page = str(page_number)
            # -f and -l select the first and last page to convert; the
            # final '-' sends the text to standard output.
            command = [
                self.pdftotext_path, '-f', page, '-l', page, temp_filepath,
                '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. wait() alone
            # can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError(
                _('pdftotext exited with return code: %d') % proc.returncode
            )

        # A lone form feed means the page produced no text.
        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        # Strip the trailing newline, newline, form feed sequence.
        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
class PDFMinerParser(Parser):
    """
    Parser for PDF files using the PDFMiner library for Python
    """
    def execute(self, file_object, page_number):
        """
        Return the text PDFMiner extracts from page page_number of
        file_object, as the bytes written to an in-memory buffer.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        with BytesIO() as string_buffer:
            rsrcmgr = PDFResourceManager()
            device = TextConverter(
                rsrcmgr, outfp=string_buffer, laparams=LAParams()
            )
            # The device is closed even when the page cannot be processed.
            try:
                interpreter = PDFPageInterpreter(rsrcmgr, device)
                # pagenos is zero based while page_number starts at one.
                pages = PDFPage.get_pages(
                    file_object, maxpages=1, pagenos=(page_number - 1,)
                )
                # next() works on Python 2 and 3; the generator .next()
                # method exists only on Python 2.
                interpreter.process_page(next(pages))
            finally:
                device.close()

            logger.debug('Finished parsing PDF: %d', page_number)

            return string_buffer.getvalue()
# Register both PDF parsers. Parsers are tried in registration order and the
# first one that succeeds wins, so PDFMinerParser acts as the fallback for
# PopplerParser.
Parser.register(
    mimetypes=('application/pdf',),
    parser_classes=(PopplerParser, PDFMinerParser)
)

View File

@@ -0,0 +1,11 @@
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from permissions import PermissionNamespace
# Permission namespace of the document parsing app.
namespace = PermissionNamespace('document_parsing', _('Document parsing'))
# Permission required to view the content of a document.
permission_content_view = namespace.add_permission(
    name='content_view', label=_('View the content of a document')
)

View File

@@ -0,0 +1,10 @@
from __future__ import unicode_literals
from django.utils.translation import ugettext_lazy as _
from task_manager.classes import CeleryQueue
# Celery queue definition and the task type that runs on it.
# NOTE(review): the queue name and the task path both point at the OCR app,
# not at document parsing -- presumably carried over from it; confirm.
queue_ocr = CeleryQueue(name='ocr', label=_('OCR'))
queue_ocr.add_task_type(
    name='ocr.tasks.task_do_ocr', label=_('Document version OCR')
)

View File

@@ -0,0 +1,11 @@
from __future__ import unicode_literals
from rest_framework import serializers
from .models import DocumentPageContent
class DocumentPageContentSerializer(serializers.ModelSerializer):
    """Serializer that exposes only the text content of a document page."""

    class Meta:
        model = DocumentPageContent
        fields = ('content',)

View File

@@ -0,0 +1,17 @@
from __future__ import unicode_literals
from django.utils.translation import ugettext_lazy as _
from smart_settings import Namespace
# Settings namespace of the document parsing app.
namespace = Namespace(name='document_parsing', label=_('Document parsing'))
# Path of the pdftotext executable; defaults to /usr/bin/pdftotext.
setting_pdftotext_path = namespace.add_setting(
    global_name='DOCUMENT_PARSING_PDFTOTEXT_PATH',
    default='/usr/bin/pdftotext',
    help_text=_(
        'File path to poppler\'s pdftotext program used to extract text '
        'from PDF files.'
    ),
    is_path=True
)

View File

@@ -0,0 +1,88 @@
from __future__ import unicode_literals
import json
from django.contrib.auth import get_user_model
from django.urls import reverse
from rest_framework import status
from documents.models import DocumentType
from documents.tests import TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
from rest_api.tests import BaseAPITestCase
from user_management.tests import (
TEST_ADMIN_EMAIL, TEST_ADMIN_PASSWORD, TEST_ADMIN_USERNAME
)
class OCRAPITestCase(BaseAPITestCase):
    """
    Test the OCR app API endpoints
    """
    # NOTE(review): these tests exercise the OCR endpoints and the
    # 'ocr_content' accessor rather than document parsing -- presumably
    # carried over from the OCR app; confirm.

    def setUp(self):
        super(OCRAPITestCase, self).setUp()

        self.admin_user = get_user_model().objects.create_superuser(
            username=TEST_ADMIN_USERNAME, email=TEST_ADMIN_EMAIL,
            password=TEST_ADMIN_PASSWORD
        )

        self.client.login(
            username=TEST_ADMIN_USERNAME, password=TEST_ADMIN_PASSWORD
        )

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is a binary file and must not be opened in text mode.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document_type.delete()
        super(OCRAPITestCase, self).tearDown()

    def test_submit_document(self):
        response = self.client.post(
            reverse(
                'rest_api:document-ocr-submit-view',
                args=(self.document.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertTrue('Mayan EDMS Documentation' in content)

    def test_submit_document_version(self):
        response = self.client.post(
            reverse(
                'rest_api:document-version-ocr-submit-view',
                args=(self.document.latest_version.pk,)
            )
        )

        self.assertEqual(response.status_code, status.HTTP_202_ACCEPTED)

        content = self.document.pages.first().ocr_content.content

        self.assertTrue('Mayan EDMS Documentation' in content)

    def test_get_document_version_page_content(self):
        response = self.client.get(
            reverse(
                'rest_api:document-page-content-view',
                args=(self.document.latest_version.pages.first().pk,)
            ),
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)

        self.assertTrue(
            'Mayan EDMS Documentation' in json.loads(response.content)['content']
        )

View File

@@ -0,0 +1,41 @@
from __future__ import unicode_literals
from actstream.models import Action
from documents.tests.test_models import GenericDocumentTestCase
from ..events import (
event_ocr_document_version_submit, event_ocr_document_version_finish
)
class OCREventsTestCase(GenericDocumentTestCase):
    """
    Check the events recorded when a document is submitted for OCR.
    """
    # NOTE(review): the events module and submit_for_ocr() belong to the OCR
    # app -- presumably carried over from it; confirm.

    def test_document_version_submit_event(self):
        # Start from a clean action log so first() is the submit event.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.first().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.first().verb,
            event_ocr_document_version_submit.name
        )

    def test_document_version_finish_event(self):
        # Start from a clean action log so last() is the finish event.
        # The debugging prints left here were Python 2 print statements and
        # imported a model this app does not define; they are removed.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        self.assertEqual(
            Action.objects.last().target, self.document.latest_version
        )
        self.assertEqual(
            Action.objects.last().verb,
            event_ocr_document_version_finish.name
        )

View File

@@ -0,0 +1,77 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.settings import setting_language_choices
from documents.tests import (
TEST_DEU_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_SMALL_DOCUMENT_PATH
)
class DocumentOCRTestCase(BaseTestCase):
    """
    Check that an uploaded document ends up with text content on its
    first page.
    """
    # PyOCR's leak descriptor in get_available_languages and image_to_string
    # Disable descriptor leak test until fixed in upstream
    _skip_file_descriptor_test = True

    def setUp(self):
        super(DocumentOCRTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is a binary file and must not be opened in text mode.
        with open(TEST_SMALL_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object,
            )

    def tearDown(self):
        self.document.delete()
        self.document_type.delete()
        super(DocumentOCRTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        # NOTE(review): reads the 'ocr_content' accessor, not this app's
        # 'content' one -- presumably carried over from the OCR app; confirm.
        content = self.document.pages.first().ocr_content.content
        self.assertTrue('Mayan EDMS Documentation' in content)
class GermanOCRSupportTestCase(BaseTestCase):
    """
    Check that a German document uploaded with the German language code
    ends up with the expected German text on its first page.
    """
    # PyOCR's leak descriptor in get_available_languages and image_to_string
    # Disable descriptor leak test until fixed in upstream
    _skip_file_descriptor_test = True

    def setUp(self):
        super(GermanOCRSupportTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Get corresponding language code for German from the default language
        # choices list
        language_code = [
            language for language in setting_language_choices.value if language[1] == 'German'
        ][0][0]

        self.assertEqual('deu', language_code)

        # The fixture is a binary file and must not be opened in text mode.
        with open(TEST_DEU_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=file_object, language=language_code
            )

    def tearDown(self):
        self.document_type.delete()
        super(GermanOCRSupportTestCase, self).tearDown()

    def test_ocr_language_backends_end(self):
        # NOTE(review): reads the 'ocr_content' accessor, not this app's
        # 'content' one -- presumably carried over from the OCR app; confirm.
        content = self.document.pages.first().ocr_content.content

        self.assertTrue(
            'Repository für elektronische Dokumente.' in content
        )
        self.assertTrue(
            'Es bietet einen' in content
        )

View File

@@ -0,0 +1,83 @@
from __future__ import unicode_literals
from django.core.files.base import File
from django.test import override_settings
from common.tests import BaseTestCase
from documents.models import DocumentType
from documents.tests import (
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
)
from ..classes import TextExtractor
from ..parsers import PDFMinerParser, PopplerParser
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """
    Run each PDF parser over the test document and check the text stored
    for its first page.
    """
    def setUp(self):
        super(ParserTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is a binary file and must not be opened in text mode.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def _first_page_content(self):
        # The parsers store their output in DocumentPageContent, which is
        # reachable from a page through the 'content' reverse accessor.
        return self.document.pages.first().content.content

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertTrue(
            'Mayan EDMS Documentation' in self._first_page_content()
        )

    def test_poppler_parser(self):
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        self.assertTrue(
            'Mayan EDMS Documentation' in self._first_page_content()
        )
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """
    Run TextExtractor over the hybrid test document and check the text
    stored for its first and last pages.
    """
    # NOTE(review): TextExtractor is imported from ..classes and the
    # assertions read 'ocr_content' -- presumably carried over from the OCR
    # app; confirm both exist for this app.

    def setUp(self):
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is a binary file and must not be opened in text mode.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        self.assertEqual(
            self.document.latest_version.pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            self.document.latest_version.pages.last().ocr_content.content,
            'Sample text in image form',
        )

View File

@@ -0,0 +1,61 @@
from __future__ import unicode_literals
from django.test import override_settings
from documents.tests.test_views import GenericDocumentViewTestCase
from ..permissions import permission_ocr_content_view
from ..utils import get_document_ocr_content
@override_settings(OCR_AUTO_OCR=True)
class OCRViewsTestCase(GenericDocumentViewTestCase):
    """
    Check the permission handling of the document content and content
    download views.
    """
    # NOTE(review): the view names use the 'ocr' namespace and the OCR
    # content permission -- presumably carried over from the OCR app; confirm.

    # PyOCR leaks file descriptors in get_available_languages and
    # image_to_string; skip the descriptor leak check until fixed upstream.
    _skip_file_descriptor_test = True

    def setUp(self):
        super(OCRViewsTestCase, self).setUp()
        self.login_user()

    def _document_content_view(self):
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_content', args=view_arguments)

    def _document_download_view(self):
        view_arguments = (self.document.pk,)
        return self.get('ocr:document_ocr_download', args=view_arguments)

    def test_document_content_view_no_permissions(self):
        response = self._document_content_view()

        self.assertEqual(response.status_code, 403)

    def test_document_content_view_with_permission(self):
        self.grant_permission(permission=permission_ocr_content_view)

        response = self._document_content_view()

        self.assertContains(
            response, 'Mayan EDMS Documentation', status_code=200
        )

    def test_document_ocr_download_view_no_permission(self):
        response = self._document_download_view()

        self.assertEqual(response.status_code, 403)

    def test_document_download_view_with_permission(self):
        self.expected_content_type = 'application/octet-stream; charset=utf-8'
        self.grant_permission(permission=permission_ocr_content_view)

        response = self._document_download_view()
        self.assertEqual(response.status_code, 200)

        expected_content = ''.join(
            get_document_ocr_content(document=self.document)
        )
        self.assert_download_response(response, content=expected_content)

View File

@@ -0,0 +1,65 @@
from __future__ import unicode_literals
from django.conf.urls import url
from .api_views import (
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
)
from .views import (
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
DocumentOCRErrorsListView, DocumentSubmitView, DocumentSubmitManyView,
DocumentTypeSettingsEditView, DocumentTypeSubmitView, EntryListView
)
# User interface URL patterns of the app.
urlpatterns = [
    # Content of a single document.
    url(
        r'^(?P<pk>\d+)/content/$', DocumentOCRContent.as_view(),
        name='document_content'
    ),
    # Submit views: single document, all documents, by document type and
    # multiple selected documents.
    url(
        r'^document/(?P<pk>\d+)/submit/$', DocumentSubmitView.as_view(),
        name='document_submit'
    ),
    url(
        r'^document/all/submit/$', DocumentAllSubmitView.as_view(),
        name='document_submit_all'
    ),
    url(
        r'^document/type/submit/$', DocumentTypeSubmitView.as_view(),
        name='document_type_submit'
    ),
    url(
        r'^document/multiple/submit/$', DocumentSubmitManyView.as_view(),
        name='document_submit_multiple'
    ),
    # Per document type settings.
    url(
        r'^document_type/(?P<pk>\d+)/ocr/settings/$',
        DocumentTypeSettingsEditView.as_view(),
        name='document_type_ocr_settings'
    ),
    # Per document error list and content download.
    url(
        r'^documents/(?P<pk>\d+)/ocr/errors/$',
        DocumentOCRErrorsListView.as_view(), name='document_ocr_error_list'
    ),
    url(
        r'^documents/(?P<pk>\d+)/ocr/download/$',
        DocumentOCRDownloadView.as_view(), name='document_ocr_download'
    ),
    # List of all the error entries.
    url(r'^all/$', EntryListView.as_view(), name='entry_list'),
]
# REST API URL patterns of the app.
api_urls = [
    # Submit a document.
    url(
        r'^document/(?P<pk>\d+)/submit/$', APIDocumentOCRView.as_view(),
        name='document-ocr-submit-view'
    ),
    # Submit a document version.
    url(
        r'^document_version/(?P<pk>\d+)/submit/$',
        APIDocumentVersionOCRView.as_view(),
        name='document-version-ocr-submit-view'
    ),
    # Retrieve the content of a document page.
    url(
        r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
        name='document-page-content-view'
    ),
]

View File

@@ -0,0 +1,16 @@
from __future__ import unicode_literals
from django.utils.encoding import force_text
from django.utils.html import conditional_escape
from .models import DocumentPageContent
def get_document_ocr_content(document):
    """
    Yield the HTML escaped text content of each page of document.

    Pages that have no DocumentPageContent row are skipped.
    """
    for page in document.pages.all():
        try:
            # 'content' is the reverse accessor declared by
            # DocumentPageContent.document_page; it is the accessor that
            # raises the DoesNotExist caught below.
            page_content = page.content.content
        except DocumentPageContent.DoesNotExist:
            continue

        yield conditional_escape(force_text(page_content))

View File

@@ -0,0 +1,190 @@
from __future__ import absolute_import, unicode_literals
from django.contrib import messages
from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404
from django.urls import reverse
from django.utils.translation import ugettext_lazy as _
from acls.models import AccessControlList
from common.generics import (
ConfirmView, FormView, SingleObjectDetailView, SingleObjectDownloadView,
SingleObjectEditView, SingleObjectListView
)
from common.mixins import MultipleInstanceActionMixin
from documents.models import Document, DocumentType
from .forms import DocumentContentForm, DocumentTypeSelectForm
from .models import DocumentVersionOCRError
from .permissions import (
permission_ocr_content_view, permission_ocr_document,
permission_document_type_ocr_setup
)
from .utils import get_document_ocr_content
class DocumentAllSubmitView(ConfirmView):
    """Confirmation view that submits every document for OCR."""
    extra_context = {'title': _('Submit all documents for OCR?')}

    def get_post_action_redirect(self):
        return reverse('common:tools_list')

    def view_action(self):
        # enumerate() keeps the running total; it stays at 0 when there are
        # no documents at all.
        count = 0
        for count, document in enumerate(Document.objects.all(), 1):
            document.submit_for_ocr()

        message = _('%d documents added to the OCR queue.') % count
        messages.success(self.request, message)
class DocumentSubmitView(ConfirmView):
    """
    Confirmation view that submits a single document for OCR after
    checking the user's access to it.
    """
    def get_extra_context(self):
        # Fetch the document once instead of once per context entry.
        instance = self.get_object()
        return {
            'object': instance,
            'title': _('Submit "%s" to the OCR queue?') % instance
        }

    def get_object(self):
        # An unknown pk is answered with 404, as the other views of this
        # module do, instead of an unhandled DoesNotExist.
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def object_action(self, instance):
        # check_access is called before the document is queued.
        AccessControlList.objects.check_access(
            permissions=permission_ocr_document, user=self.request.user,
            obj=instance
        )

        instance.submit_for_ocr()

    def view_action(self):
        instance = self.get_object()

        self.object_action(instance=instance)

        messages.success(
            self.request,
            _('Document: %(document)s was added to the OCR queue.') % {
                'document': instance
            }
        )
class DocumentSubmitManyView(MultipleInstanceActionMixin, DocumentSubmitView):
    """Variant of DocumentSubmitView for several selected documents."""
    model = Document
    success_message = '%(count)d document submitted to the OCR queue.'
    success_message_plural = '%(count)d documents submitted to the OCR queue.'

    def get_extra_context(self):
        # Replaces the single document context of the parent class.
        title = _('Submit the selected documents to the OCR queue?')
        return {'title': title}
class DocumentTypeSubmitView(FormView):
    """
    Form view that submits for OCR every document of the document type
    selected in the form.
    """
    extra_context = {
        'title': _('Submit all documents of a type for OCR')
    }
    form_class = DocumentTypeSelectForm

    def get_post_action_redirect(self):
        return reverse('common:tools_list')

    def form_valid(self, form):
        document_type = form.cleaned_data['document_type']

        # enumerate() keeps the running total; it stays at 0 when the type
        # has no documents.
        count = 0
        for count, document in enumerate(document_type.documents.all(), 1):
            document.submit_for_ocr()

        message = _(
            '%(count)d documents of type "%(document_type)s" added to the '
            'OCR queue.'
        ) % {'count': count, 'document_type': document_type}
        messages.success(self.request, message)

        return HttpResponseRedirect(self.get_success_url())
class DocumentTypeSettingsEditView(SingleObjectEditView):
    """Edit view of the OCR settings attached to a document type."""
    view_permission = permission_document_type_ocr_setup
    fields = ('auto_ocr',)

    def get_object(self, queryset=None):
        # The edited object is the settings row, not the document type.
        document_type = get_object_or_404(DocumentType, pk=self.kwargs['pk'])
        return document_type.ocr_settings

    def get_extra_context(self):
        document_type = self.get_object().document_type
        title = _('Edit OCR settings for document type: %s') % document_type
        return {'title': title}
class DocumentOCRContent(SingleObjectDetailView):
    """Detail view that shows the text content of a document."""
    form_class = DocumentContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        # The parent dispatch runs first; the document is recorded as
        # recently accessed only after it returns.
        result = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )
        self.get_object().add_as_recent_document_for_user(request.user)
        return result

    def get_extra_context(self):
        # Fetch the document once instead of once per context entry.
        document = self.get_object()
        return {
            'document': document,
            'hide_labels': True,
            'object': document,
            'title': _('OCR result for document: %s') % document,
        }
class EntryListView(SingleObjectListView):
    """List view of every recorded error entry."""
    # NOTE(review): DocumentVersionOCRError is not defined in this app's
    # models module as shown (it defines DocumentVersionParseError) --
    # confirm which model this list should show.
    view_permission = permission_ocr_document
    extra_context = {
        'hide_object': True,
        'title': _('OCR errors'),
    }

    def get_object_list(self):
        error_entries = DocumentVersionOCRError.objects.all()
        return error_entries
class DocumentOCRErrorsListView(SingleObjectListView):
    """List view of the errors recorded for a document's latest version."""
    view_permission = permission_ocr_document

    def get_document(self):
        return get_object_or_404(Document, pk=self.kwargs['pk'])

    def get_extra_context(self):
        # Fetch the document once instead of once per context entry.
        document = self.get_document()
        return {
            'hide_object': True,
            'object': document,
            'title': _('OCR errors for document: %s') % document,
        }

    def get_object_list(self):
        # NOTE(review): this app's DocumentVersionParseError uses the
        # 'parse_errors' accessor; 'ocr_errors' presumably belongs to the
        # OCR app -- confirm which list is intended.
        return self.get_document().latest_version.ocr_errors.all()
class DocumentOCRDownloadView(SingleObjectDownloadView):
    """Download view that serves a document's text content as a file."""
    model = Document
    object_permission = permission_ocr_content_view

    def get_file(self):
        # Fetch the document once and use it for both the content iterator
        # and the download name.
        document = self.get_object()

        file_object = DocumentOCRDownloadView.TextIteratorIO(
            iterator=get_document_ocr_content(document=document)
        )
        return DocumentOCRDownloadView.VirtualFile(
            file=file_object, name='{}-OCR'.format(document)
        )

View File

@@ -84,6 +84,7 @@ INSTALLED_APPS = (
'checkouts',
'document_comments',
'document_indexing',
'document_parsing',
'document_signatures',
'document_states',
'documents',