Refactor OCR app. Remove document parsing. Move OCR processing to the
model manager. Add submit and finish events. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
|
|||||||
from django.contrib import admin
|
from django.contrib import admin
|
||||||
|
|
||||||
from .models import (
|
from .models import (
|
||||||
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
|
DocumentPageOCRContent, DocumentTypeSettings, DocumentVersionOCRError
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@admin.register(DocumentPageContent)
|
@admin.register(DocumentPageOCRContent)
|
||||||
class DocumentPageContentAdmin(admin.ModelAdmin):
|
class DocumentPageOCRContentAdmin(admin.ModelAdmin):
|
||||||
list_display = ('document_page',)
|
list_display = ('document_page',)
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -6,9 +6,9 @@ from rest_framework.response import Response
|
|||||||
from documents.models import Document, DocumentPage, DocumentVersion
|
from documents.models import Document, DocumentPage, DocumentVersion
|
||||||
from rest_api.permissions import MayanPermission
|
from rest_api.permissions import MayanPermission
|
||||||
|
|
||||||
from .models import DocumentPageContent
|
from .models import DocumentPageOCRContent
|
||||||
from .permissions import permission_ocr_content_view, permission_ocr_document
|
from .permissions import permission_ocr_content_view, permission_ocr_document
|
||||||
from .serializers import DocumentPageContentSerializer
|
from .serializers import DocumentPageOCRContentSerializer
|
||||||
|
|
||||||
|
|
||||||
class APIDocumentOCRView(generics.GenericAPIView):
|
class APIDocumentOCRView(generics.GenericAPIView):
|
||||||
@@ -67,7 +67,7 @@ class APIDocumentVersionOCRView(generics.GenericAPIView):
|
|||||||
return Response(status=status.HTTP_202_ACCEPTED)
|
return Response(status=status.HTTP_202_ACCEPTED)
|
||||||
|
|
||||||
|
|
||||||
class APIDocumentPageContentView(generics.RetrieveAPIView):
|
class APIDocumentPageOCRContentView(generics.RetrieveAPIView):
|
||||||
"""
|
"""
|
||||||
Returns the OCR content of the selected document page.
|
Returns the OCR content of the selected document page.
|
||||||
---
|
---
|
||||||
@@ -82,7 +82,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
|||||||
'GET': (permission_ocr_content_view,),
|
'GET': (permission_ocr_content_view,),
|
||||||
}
|
}
|
||||||
permission_classes = (MayanPermission,)
|
permission_classes = (MayanPermission,)
|
||||||
serializer_class = DocumentPageContentSerializer
|
serializer_class = DocumentPageOCRContentSerializer
|
||||||
queryset = DocumentPage.objects.all()
|
queryset = DocumentPage.objects.all()
|
||||||
|
|
||||||
def retrieve(self, request, *args, **kwargs):
|
def retrieve(self, request, *args, **kwargs):
|
||||||
@@ -90,8 +90,8 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
|||||||
|
|
||||||
try:
|
try:
|
||||||
ocr_content = instance.ocr_content
|
ocr_content = instance.ocr_content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentPageOCRContent.DoesNotExist:
|
||||||
ocr_content = DocumentPageContent.objects.none()
|
ocr_content = DocumentPageOCRContent.objects.none()
|
||||||
|
|
||||||
serializer = self.get_serializer(ocr_content)
|
serializer = self.get_serializer(ocr_content)
|
||||||
return Response(serializer.data)
|
return Response(serializer.data)
|
||||||
|
|||||||
@@ -1,11 +1,13 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from datetime import timedelta
|
||||||
import logging
|
import logging
|
||||||
|
|
||||||
from kombu import Exchange, Queue
|
from kombu import Exchange, Queue
|
||||||
|
|
||||||
from django.apps import apps
|
from django.apps import apps
|
||||||
from django.db.models.signals import post_save
|
from django.db.models.signals import post_save
|
||||||
|
from django.utils.timezone import now
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
from acls import ModelPermission
|
from acls import ModelPermission
|
||||||
@@ -21,7 +23,10 @@ from mayan.celery import app
|
|||||||
from navigation import SourceColumn
|
from navigation import SourceColumn
|
||||||
from rest_api.classes import APIEndPoint
|
from rest_api.classes import APIEndPoint
|
||||||
|
|
||||||
from .handlers import initialize_new_ocr_settings, post_version_upload_ocr
|
from .events import event_ocr_document_version_submit
|
||||||
|
from .handlers import (
|
||||||
|
handler_initialize_new_ocr_settings, handler_ocr_document_version,
|
||||||
|
)
|
||||||
from .links import (
|
from .links import (
|
||||||
link_document_content, link_document_ocr_download,
|
link_document_content, link_document_ocr_download,
|
||||||
link_document_ocr_erros_list, link_document_submit,
|
link_document_ocr_erros_list, link_document_submit,
|
||||||
@@ -36,17 +41,17 @@ logger = logging.getLogger(__name__)
|
|||||||
|
|
||||||
|
|
||||||
def document_ocr_submit(self):
|
def document_ocr_submit(self):
|
||||||
from .tasks import task_do_ocr
|
self.latest_version.submit_for_ocr()
|
||||||
|
|
||||||
task_do_ocr.apply_async(args=(self.latest_version.pk,))
|
|
||||||
|
|
||||||
|
|
||||||
def document_version_ocr_submit(self):
|
def document_version_ocr_submit(self):
|
||||||
from .tasks import task_do_ocr
|
from .tasks import task_do_ocr
|
||||||
|
|
||||||
|
event_ocr_document_version_submit.commit(target=self)
|
||||||
|
|
||||||
task_do_ocr.apply_async(
|
task_do_ocr.apply_async(
|
||||||
|
eta=now() + timedelta(seconds=settings_db_sync_task_delay.value),
|
||||||
kwargs={'document_version_pk': self.pk},
|
kwargs={'document_version_pk': self.pk},
|
||||||
countdown=settings_db_sync_task_delay.value
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -155,10 +160,12 @@ class OCRApp(MayanAppConfig):
|
|||||||
)
|
)
|
||||||
|
|
||||||
post_save.connect(
|
post_save.connect(
|
||||||
initialize_new_ocr_settings,
|
dispatch_uid='ocr_handler_initialize_new_ocr_settings',
|
||||||
dispatch_uid='initialize_new_ocr_settings', sender=DocumentType
|
receiver=handler_initialize_new_ocr_settings,
|
||||||
|
sender=DocumentType
|
||||||
)
|
)
|
||||||
post_version_upload.connect(
|
post_version_upload.connect(
|
||||||
post_version_upload_ocr, dispatch_uid='post_version_upload_ocr',
|
dispatch_uid='ocr_handler_ocr_document_version',
|
||||||
|
receiver=handler_ocr_document_version,
|
||||||
sender=DocumentVersion
|
sender=DocumentVersion
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,80 +1,9 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from django.utils.module_loading import import_string
|
|
||||||
|
|
||||||
from converter import converter_class
|
from converter import converter_class
|
||||||
from documents.runtime import cache_storage_backend
|
|
||||||
|
|
||||||
from .exceptions import NoMIMETypeMatch, ParserError
|
|
||||||
from .models import DocumentPageContent
|
|
||||||
from .parsers import Parser
|
|
||||||
from .settings import setting_ocr_backend
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class TextExtractor(object):
|
|
||||||
@classmethod
|
|
||||||
def perform_ocr(cls, document_page):
|
|
||||||
ocr_backend_class = import_string(setting_ocr_backend.value)
|
|
||||||
backend = ocr_backend_class()
|
|
||||||
backend.process_document_page(document_page)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def process_document_page(cls, document_page):
|
|
||||||
"""
|
|
||||||
Extract text for a document version's page. Try parsing the page and if
|
|
||||||
no there are not parsers for the MIME type or the parser return nothing
|
|
||||||
fallback to doing and OCR of the page.
|
|
||||||
"""
|
|
||||||
|
|
||||||
try:
|
|
||||||
Parser.parse_document_page(document_page=document_page)
|
|
||||||
except (NoMIMETypeMatch, ParserError):
|
|
||||||
cls.perform_ocr(document_page=document_page)
|
|
||||||
else:
|
|
||||||
if not document_page.ocr_content.content:
|
|
||||||
cls.perform_ocr(document_page=document_page)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def process_document_version(cls, document_version):
|
|
||||||
for document_page in document_version.pages.all():
|
|
||||||
cls.process_document_page(document_page=document_page)
|
|
||||||
|
|
||||||
|
|
||||||
class OCRBackendBase(object):
|
class OCRBackendBase(object):
|
||||||
def process_document_version(self, document_version):
|
|
||||||
logger.info('Starting OCR for document version: %s', document_version)
|
|
||||||
logger.debug('document version: %d', document_version.pk)
|
|
||||||
|
|
||||||
for document_page in document_version.pages.all():
|
|
||||||
self.process_document_page(document_page=document_page)
|
|
||||||
|
|
||||||
def process_document_page(self, document_page):
|
|
||||||
logger.info(
|
|
||||||
'Processing page: %d of document version: %s',
|
|
||||||
document_page.page_number, document_page.document_version
|
|
||||||
)
|
|
||||||
|
|
||||||
cache_filename = document_page.generate_image()
|
|
||||||
|
|
||||||
with cache_storage_backend.open(cache_filename) as file_object:
|
|
||||||
document_page_content, created = DocumentPageContent.objects.get_or_create(
|
|
||||||
document_page=document_page
|
|
||||||
)
|
|
||||||
document_page_content.content = self.execute(
|
|
||||||
file_object=file_object,
|
|
||||||
language=document_page.document.language
|
|
||||||
)
|
|
||||||
document_page_content.save()
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
'Finished processing page: %d of document version: %s',
|
|
||||||
document_page.page_number, document_page.document_version
|
|
||||||
)
|
|
||||||
|
|
||||||
def execute(self, file_object, language=None, transformations=None):
|
def execute(self, file_object, language=None, transformations=None):
|
||||||
self.language = language
|
self.language = language
|
||||||
|
|
||||||
|
|||||||
14
mayan/apps/ocr/events.py
Normal file
14
mayan/apps/ocr/events.py
Normal file
@@ -0,0 +1,14 @@
|
|||||||
|
from __future__ import absolute_import, unicode_literals
|
||||||
|
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
from events.classes import Event
|
||||||
|
|
||||||
|
event_ocr_document_version_submit = Event(
|
||||||
|
name='ocr_document_version_submit',
|
||||||
|
label=_('Document version submitted for OCR')
|
||||||
|
)
|
||||||
|
event_ocr_document_version_finish = Event(
|
||||||
|
name='ocr_document_version_finish',
|
||||||
|
label=_('Document version OCR finished')
|
||||||
|
)
|
||||||
@@ -6,17 +6,3 @@ class OCRError(Exception):
|
|||||||
Raised by the OCR backend
|
Raised by the OCR backend
|
||||||
"""
|
"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class ParserError(Exception):
|
|
||||||
"""
|
|
||||||
Base exception for file parsers
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class NoMIMETypeMatch(ParserError):
|
|
||||||
"""
|
|
||||||
There is no parser registered for the specified MIME type
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|||||||
@@ -9,17 +9,17 @@ from django.utils.translation import ugettext_lazy as _, ugettext
|
|||||||
from common.widgets import TextAreaDiv
|
from common.widgets import TextAreaDiv
|
||||||
from documents.models import DocumentType
|
from documents.models import DocumentType
|
||||||
|
|
||||||
from .models import DocumentPageContent
|
from .models import DocumentPageOCRContent
|
||||||
|
|
||||||
|
|
||||||
class DocumentContentForm(forms.Form):
|
class DocumentOCRContentForm(forms.Form):
|
||||||
"""
|
"""
|
||||||
Form that concatenates all of a document pages' text content into a
|
Form that concatenates all of a document pages' text content into a
|
||||||
single textarea widget
|
single textarea widget
|
||||||
"""
|
"""
|
||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
self.document = kwargs.pop('instance', None)
|
self.document = kwargs.pop('instance', None)
|
||||||
super(DocumentContentForm, self).__init__(*args, **kwargs)
|
super(DocumentOCRContentForm, self).__init__(*args, **kwargs)
|
||||||
content = []
|
content = []
|
||||||
self.fields['contents'].initial = ''
|
self.fields['contents'].initial = ''
|
||||||
try:
|
try:
|
||||||
@@ -30,7 +30,7 @@ class DocumentContentForm(forms.Form):
|
|||||||
for page in document_pages:
|
for page in document_pages:
|
||||||
try:
|
try:
|
||||||
page_content = page.ocr_content.content
|
page_content = page.ocr_content.content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentPageOCRContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
content.append(conditional_escape(force_text(page_content)))
|
content.append(conditional_escape(force_text(page_content)))
|
||||||
|
|||||||
@@ -9,14 +9,7 @@ from .settings import setting_auto_ocr
|
|||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
def post_version_upload_ocr(sender, instance, **kwargs):
|
def handler_initialize_new_ocr_settings(sender, instance, **kwargs):
|
||||||
logger.debug('received post_version_upload')
|
|
||||||
logger.debug('instance pk: %s', instance.pk)
|
|
||||||
if instance.document.document_type.ocr_settings.auto_ocr:
|
|
||||||
instance.submit_for_ocr()
|
|
||||||
|
|
||||||
|
|
||||||
def initialize_new_ocr_settings(sender, instance, **kwargs):
|
|
||||||
DocumentTypeSettings = apps.get_model(
|
DocumentTypeSettings = apps.get_model(
|
||||||
app_label='ocr', model_name='DocumentTypeSettings'
|
app_label='ocr', model_name='DocumentTypeSettings'
|
||||||
)
|
)
|
||||||
@@ -25,3 +18,10 @@ def initialize_new_ocr_settings(sender, instance, **kwargs):
|
|||||||
DocumentTypeSettings.objects.create(
|
DocumentTypeSettings.objects.create(
|
||||||
document_type=instance, auto_ocr=setting_auto_ocr.value
|
document_type=instance, auto_ocr=setting_auto_ocr.value
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def handler_ocr_document_version(sender, instance, **kwargs):
|
||||||
|
logger.debug('received post_version_upload')
|
||||||
|
logger.debug('instance pk: %s', instance.pk)
|
||||||
|
if instance.document.document_type.ocr_settings.auto_ocr:
|
||||||
|
instance.submit_for_ocr()
|
||||||
|
|||||||
82
mayan/apps/ocr/managers.py
Normal file
82
mayan/apps/ocr/managers.py
Normal file
@@ -0,0 +1,82 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
import traceback
|
||||||
|
|
||||||
|
from django.apps import apps
|
||||||
|
from django.conf import settings
|
||||||
|
from django.db import models
|
||||||
|
|
||||||
|
from documents.runtime import cache_storage_backend
|
||||||
|
|
||||||
|
from .events import event_ocr_document_version_finish
|
||||||
|
from .runtime import ocr_backend
|
||||||
|
from .signals import post_document_version_ocr
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentPageOCRContentManager(models.Manager):
|
||||||
|
def process_document_version(self, document_version):
|
||||||
|
logger.info('Starting OCR for document version: %s', document_version)
|
||||||
|
logger.debug('document version: %d', document_version.pk)
|
||||||
|
|
||||||
|
try:
|
||||||
|
for document_page in document_version.pages.all():
|
||||||
|
self.process_document_page(document_page=document_page)
|
||||||
|
except Exception as exception:
|
||||||
|
logger.error(
|
||||||
|
'OCR error for document version: %d; %s', document_version,
|
||||||
|
exception
|
||||||
|
)
|
||||||
|
|
||||||
|
if settings.DEBUG:
|
||||||
|
result = []
|
||||||
|
type, value, tb = sys.exc_info()
|
||||||
|
result.append('%s: %s' % (type.__name__, value))
|
||||||
|
result.extend(traceback.format_tb(tb))
|
||||||
|
document_version.ocr_errors.create(
|
||||||
|
result='\n'.join(result)
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
document_version.ocr_errors.create(result=exception)
|
||||||
|
else:
|
||||||
|
logger.info(
|
||||||
|
'OCR complete for document version: %s', document_version
|
||||||
|
)
|
||||||
|
document_version.ocr_errors.all().delete()
|
||||||
|
|
||||||
|
event_ocr_document_version_finish.commit(target=document_version)
|
||||||
|
|
||||||
|
post_document_version_ocr.send(
|
||||||
|
sender=document_version.__class__, instance=document_version
|
||||||
|
)
|
||||||
|
|
||||||
|
def process_document_page(self, document_page):
|
||||||
|
logger.info(
|
||||||
|
'Processing page: %d of document version: %s',
|
||||||
|
document_page.page_number, document_page.document_version
|
||||||
|
)
|
||||||
|
|
||||||
|
DocumentPageOCRContent = apps.get_model(
|
||||||
|
app_label='ocr', model_name='DocumentPageOCRContent'
|
||||||
|
)
|
||||||
|
|
||||||
|
# TODO: Call task and wait
|
||||||
|
cache_filename = document_page.generate_image()
|
||||||
|
|
||||||
|
with cache_storage_backend.open(cache_filename) as file_object:
|
||||||
|
document_page_content, created = DocumentPageOCRContent.objects.get_or_create(
|
||||||
|
document_page=document_page
|
||||||
|
)
|
||||||
|
document_page_content.content = ocr_backend.execute(
|
||||||
|
file_object=file_object,
|
||||||
|
language=document_page.document.language
|
||||||
|
)
|
||||||
|
document_page_content.save()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
'Finished processing page: %d of document version: %s',
|
||||||
|
document_page.page_number, document_page.document_version
|
||||||
|
)
|
||||||
67
mayan/apps/ocr/migrations/0006_auto_20170823_0553.py
Normal file
67
mayan/apps/ocr/migrations/0006_auto_20170823_0553.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# Generated by Django 1.10.7 on 2017-08-23 05:53
|
||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
import django.db.models.deletion
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
|
||||||
|
|
||||||
|
dependencies = [
|
||||||
|
('documents', '0040_auto_20170725_1111'),
|
||||||
|
('ocr', '0005_auto_20170630_1846'),
|
||||||
|
]
|
||||||
|
|
||||||
|
operations = [
|
||||||
|
migrations.CreateModel(
|
||||||
|
name='DocumentPageOCRContent',
|
||||||
|
fields=[
|
||||||
|
(
|
||||||
|
'id', models.AutoField(
|
||||||
|
auto_created=True, primary_key=True, serialize=False,
|
||||||
|
verbose_name='ID'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
(
|
||||||
|
'content', models.TextField(
|
||||||
|
blank=True, verbose_name='Content'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
(
|
||||||
|
'document_page', models.OneToOneField(
|
||||||
|
on_delete=django.db.models.deletion.CASCADE,
|
||||||
|
related_name='ocr', to='documents.DocumentPage',
|
||||||
|
verbose_name='Document page'
|
||||||
|
)
|
||||||
|
),
|
||||||
|
],
|
||||||
|
options={
|
||||||
|
'verbose_name': 'Document page OCR content',
|
||||||
|
'verbose_name_plural': 'Document pages OCR contents',
|
||||||
|
},
|
||||||
|
),
|
||||||
|
migrations.RemoveField(
|
||||||
|
model_name='documentpagecontent',
|
||||||
|
name='document_page',
|
||||||
|
),
|
||||||
|
migrations.AlterModelOptions(
|
||||||
|
name='documentversionocrerror',
|
||||||
|
options={
|
||||||
|
'ordering': ('datetime_submitted',),
|
||||||
|
'verbose_name': 'Document version OCR error',
|
||||||
|
'verbose_name_plural': 'Document version OCR errors'
|
||||||
|
},
|
||||||
|
),
|
||||||
|
migrations.AlterField(
|
||||||
|
model_name='documentversionocrerror',
|
||||||
|
name='datetime_submitted',
|
||||||
|
field=models.DateTimeField(
|
||||||
|
auto_now_add=True, db_index=True,
|
||||||
|
verbose_name='Date time submitted'
|
||||||
|
),
|
||||||
|
),
|
||||||
|
migrations.DeleteModel(
|
||||||
|
name='DocumentPageContent',
|
||||||
|
),
|
||||||
|
]
|
||||||
@@ -6,6 +6,8 @@ from django.utils.translation import ugettext_lazy as _
|
|||||||
|
|
||||||
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
||||||
|
|
||||||
|
from .managers import DocumentPageOCRContentManager
|
||||||
|
|
||||||
|
|
||||||
class DocumentTypeSettings(models.Model):
|
class DocumentTypeSettings(models.Model):
|
||||||
"""
|
"""
|
||||||
@@ -25,6 +27,24 @@ class DocumentTypeSettings(models.Model):
|
|||||||
verbose_name_plural = _('Document types settings')
|
verbose_name_plural = _('Document types settings')
|
||||||
|
|
||||||
|
|
||||||
|
@python_2_unicode_compatible
|
||||||
|
class DocumentPageOCRContent(models.Model):
|
||||||
|
document_page = models.OneToOneField(
|
||||||
|
DocumentPage, on_delete=models.CASCADE, related_name='ocr_content',
|
||||||
|
verbose_name=_('Document page')
|
||||||
|
)
|
||||||
|
content = models.TextField(blank=True, verbose_name=_('Content'))
|
||||||
|
|
||||||
|
objects = DocumentPageOCRContentManager()
|
||||||
|
|
||||||
|
def __str__(self):
|
||||||
|
return force_text(self.document_page)
|
||||||
|
|
||||||
|
class Meta:
|
||||||
|
verbose_name = _('Document page OCR content')
|
||||||
|
verbose_name_plural = _('Document pages OCR contents')
|
||||||
|
|
||||||
|
|
||||||
@python_2_unicode_compatible
|
@python_2_unicode_compatible
|
||||||
class DocumentVersionOCRError(models.Model):
|
class DocumentVersionOCRError(models.Model):
|
||||||
document_version = models.ForeignKey(
|
document_version = models.ForeignKey(
|
||||||
@@ -32,7 +52,7 @@ class DocumentVersionOCRError(models.Model):
|
|||||||
verbose_name=_('Document version')
|
verbose_name=_('Document version')
|
||||||
)
|
)
|
||||||
datetime_submitted = models.DateTimeField(
|
datetime_submitted = models.DateTimeField(
|
||||||
auto_now=True, db_index=True, verbose_name=_('Date time submitted')
|
auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
|
||||||
)
|
)
|
||||||
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
||||||
|
|
||||||
@@ -41,24 +61,5 @@ class DocumentVersionOCRError(models.Model):
|
|||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
ordering = ('datetime_submitted',)
|
ordering = ('datetime_submitted',)
|
||||||
verbose_name = _('Document Version OCR Error')
|
verbose_name = _('Document version OCR error')
|
||||||
verbose_name_plural = _('Document Version OCR Errors')
|
verbose_name_plural = _('Document version OCR errors')
|
||||||
|
|
||||||
|
|
||||||
@python_2_unicode_compatible
|
|
||||||
class DocumentPageContent(models.Model):
|
|
||||||
"""
|
|
||||||
Model that describes a document page content
|
|
||||||
"""
|
|
||||||
document_page = models.OneToOneField(
|
|
||||||
DocumentPage, on_delete=models.CASCADE, related_name='ocr_content',
|
|
||||||
verbose_name=_('Document page')
|
|
||||||
)
|
|
||||||
content = models.TextField(blank=True, verbose_name=_('Content'))
|
|
||||||
|
|
||||||
def __str__(self):
|
|
||||||
return force_text(self.document_page)
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
verbose_name = _('Document page content')
|
|
||||||
verbose_name_plural = _('Document pages contents')
|
|
||||||
|
|||||||
@@ -1,202 +0,0 @@
|
|||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from io import BytesIO
|
|
||||||
import logging
|
|
||||||
import os
|
|
||||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
|
||||||
from pdfminer.pdfpage import PDFPage
|
|
||||||
from pdfminer.converter import TextConverter
|
|
||||||
from pdfminer.layout import LAParams
|
|
||||||
import subprocess
|
|
||||||
|
|
||||||
from django.utils.translation import ugettext_lazy as _
|
|
||||||
|
|
||||||
from common.utils import copyfile, fs_cleanup, mkstemp
|
|
||||||
|
|
||||||
from .exceptions import ParserError, NoMIMETypeMatch
|
|
||||||
from .models import DocumentPageContent
|
|
||||||
from .settings import setting_pdftotext_path
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class Parser(object):
|
|
||||||
"""
|
|
||||||
Parser base class
|
|
||||||
"""
|
|
||||||
|
|
||||||
_registry = {}
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def register(cls, mimetypes, parser_classes):
|
|
||||||
for mimetype in mimetypes:
|
|
||||||
for parser_class in parser_classes:
|
|
||||||
cls._registry.setdefault(
|
|
||||||
mimetype, []
|
|
||||||
).append(parser_class)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def parse_document_version(cls, document_version):
|
|
||||||
try:
|
|
||||||
for parser_class in cls._registry[document_version.mimetype]:
|
|
||||||
try:
|
|
||||||
parser = parser_class()
|
|
||||||
parser.process_document_version(document_version)
|
|
||||||
except ParserError:
|
|
||||||
# If parser raises error, try next parser in the list
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# If parser was successfull there is no need to try
|
|
||||||
# others in the list for this mimetype
|
|
||||||
return
|
|
||||||
|
|
||||||
raise NoMIMETypeMatch('Parser MIME type list exhausted')
|
|
||||||
except KeyError:
|
|
||||||
raise NoMIMETypeMatch
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def parse_document_page(cls, document_page):
|
|
||||||
try:
|
|
||||||
for parser_class in cls._registry[document_page.document_version.mimetype]:
|
|
||||||
try:
|
|
||||||
parser = parser_class()
|
|
||||||
parser.process_document_page(document_page)
|
|
||||||
except ParserError:
|
|
||||||
# If parser raises error, try next parser in the list
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
# If parser was successfull there is no need to try
|
|
||||||
# others in the list for this mimetype
|
|
||||||
return
|
|
||||||
raise NoMIMETypeMatch('Parser MIME type list exhausted')
|
|
||||||
except KeyError:
|
|
||||||
raise NoMIMETypeMatch
|
|
||||||
|
|
||||||
def process_document_version(self, document_version):
|
|
||||||
logger.info(
|
|
||||||
'Starting parsing for document version: %s', document_version
|
|
||||||
)
|
|
||||||
logger.debug('document version: %d', document_version.pk)
|
|
||||||
|
|
||||||
for document_page in document_version.pages.all():
|
|
||||||
self.process_document_page(document_page=document_page)
|
|
||||||
|
|
||||||
def process_document_page(self, document_page):
|
|
||||||
logger.info(
|
|
||||||
'Processing page: %d of document version: %s',
|
|
||||||
document_page.page_number, document_page.document_version
|
|
||||||
)
|
|
||||||
|
|
||||||
file_object = document_page.document_version.get_intermidiate_file()
|
|
||||||
|
|
||||||
try:
|
|
||||||
document_page_content, created = DocumentPageContent.objects.get_or_create(
|
|
||||||
document_page=document_page
|
|
||||||
)
|
|
||||||
document_page_content.content = self.execute(
|
|
||||||
file_object=file_object, page_number=document_page.page_number
|
|
||||||
)
|
|
||||||
document_page_content.save()
|
|
||||||
except Exception as exception:
|
|
||||||
error_message = _('Exception parsing page; %s') % exception
|
|
||||||
logger.error(error_message)
|
|
||||||
raise ParserError(error_message)
|
|
||||||
finally:
|
|
||||||
file_object.close()
|
|
||||||
|
|
||||||
logger.info(
|
|
||||||
'Finished processing page: %d of document version: %s',
|
|
||||||
document_page.page_number, document_page.document_version
|
|
||||||
)
|
|
||||||
|
|
||||||
def execute(self, file_object, page_number):
|
|
||||||
raise NotImplementedError(
|
|
||||||
'Your %s class has not defined the required execute() method.' %
|
|
||||||
self.__class__.__name__
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class PopplerParser(Parser):
|
|
||||||
"""
|
|
||||||
PDF parser using the pdftotext execute from the poppler package
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __init__(self):
|
|
||||||
self.pdftotext_path = setting_pdftotext_path.value
|
|
||||||
if not os.path.exists(self.pdftotext_path):
|
|
||||||
error_message = _(
|
|
||||||
'Cannot find pdftotext executable at: %s'
|
|
||||||
) % self.pdftotext_path
|
|
||||||
logger.error(error_message)
|
|
||||||
raise ParserError(error_message)
|
|
||||||
|
|
||||||
logger.debug('self.pdftotext_path: %s', self.pdftotext_path)
|
|
||||||
|
|
||||||
def execute(self, file_object, page_number):
|
|
||||||
logger.debug('Parsing PDF page: %d', page_number)
|
|
||||||
|
|
||||||
destination_descriptor, temp_filepath = mkstemp()
|
|
||||||
copyfile(file_object, temp_filepath)
|
|
||||||
|
|
||||||
command = []
|
|
||||||
command.append(self.pdftotext_path)
|
|
||||||
command.append('-f')
|
|
||||||
command.append(str(page_number))
|
|
||||||
command.append('-l')
|
|
||||||
command.append(str(page_number))
|
|
||||||
command.append(temp_filepath)
|
|
||||||
command.append('-')
|
|
||||||
|
|
||||||
proc = subprocess.Popen(
|
|
||||||
command, close_fds=True, stderr=subprocess.PIPE,
|
|
||||||
stdout=subprocess.PIPE
|
|
||||||
)
|
|
||||||
return_code = proc.wait()
|
|
||||||
if return_code != 0:
|
|
||||||
logger.error(proc.stderr.readline())
|
|
||||||
fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)
|
|
||||||
|
|
||||||
raise ParserError
|
|
||||||
|
|
||||||
output = proc.stdout.read()
|
|
||||||
fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)
|
|
||||||
|
|
||||||
if output == b'\x0c':
|
|
||||||
logger.debug('Parser didn\'t return any output')
|
|
||||||
return ''
|
|
||||||
|
|
||||||
if output[-3:] == b'\x0a\x0a\x0c':
|
|
||||||
return output[:-3]
|
|
||||||
|
|
||||||
return output
|
|
||||||
|
|
||||||
|
|
||||||
class PDFMinerParser(Parser):
|
|
||||||
"""
|
|
||||||
Parser for PDF files using the PDFMiner library for Python
|
|
||||||
"""
|
|
||||||
|
|
||||||
def execute(self, file_object, page_number):
|
|
||||||
logger.debug('Parsing PDF page: %d', page_number)
|
|
||||||
|
|
||||||
with BytesIO() as string_buffer:
|
|
||||||
rsrcmgr = PDFResourceManager()
|
|
||||||
device = TextConverter(
|
|
||||||
rsrcmgr, outfp=string_buffer, laparams=LAParams()
|
|
||||||
)
|
|
||||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
|
||||||
page = PDFPage.get_pages(
|
|
||||||
file_object, maxpages=1, pagenos=(page_number - 1,)
|
|
||||||
)
|
|
||||||
interpreter.process_page(page.next())
|
|
||||||
device.close()
|
|
||||||
|
|
||||||
logger.debug('Finished parsing PDF: %d', page_number)
|
|
||||||
|
|
||||||
return string_buffer.getvalue()
|
|
||||||
|
|
||||||
|
|
||||||
Parser.register(
|
|
||||||
mimetypes=('application/pdf',),
|
|
||||||
parser_classes=(PopplerParser, PDFMinerParser)
|
|
||||||
)
|
|
||||||
5
mayan/apps/ocr/runtime.py
Normal file
5
mayan/apps/ocr/runtime.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
from django.utils.module_loading import import_string
|
||||||
|
|
||||||
|
from .settings import setting_ocr_backend
|
||||||
|
|
||||||
|
ocr_backend = import_string(setting_ocr_backend.value)()
|
||||||
@@ -2,10 +2,10 @@ from __future__ import unicode_literals
|
|||||||
|
|
||||||
from rest_framework import serializers
|
from rest_framework import serializers
|
||||||
|
|
||||||
from .models import DocumentPageContent
|
from .models import DocumentPageOCRContent
|
||||||
|
|
||||||
|
|
||||||
class DocumentPageContentSerializer(serializers.ModelSerializer):
|
class DocumentPageOCRContentSerializer(serializers.ModelSerializer):
|
||||||
class Meta:
|
class Meta:
|
||||||
fields = ('content',)
|
fields = ('content',)
|
||||||
model = DocumentPageContent
|
model = DocumentPageOCRContent
|
||||||
|
|||||||
@@ -1,84 +1,53 @@
|
|||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import sys
|
|
||||||
import traceback
|
|
||||||
|
|
||||||
from django.conf import settings
|
from django.apps import apps
|
||||||
from django.db import OperationalError
|
from django.db import OperationalError
|
||||||
|
|
||||||
from documents.models import DocumentVersion
|
|
||||||
from lock_manager import LockError
|
from lock_manager import LockError
|
||||||
from lock_manager.runtime import locking_backend
|
from lock_manager.runtime import locking_backend
|
||||||
from mayan.celery import app
|
from mayan.celery import app
|
||||||
|
|
||||||
from .classes import TextExtractor
|
|
||||||
from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
|
from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
|
||||||
from .models import DocumentVersionOCRError
|
|
||||||
from .signals import post_document_version_ocr
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
@app.task(bind=True, default_retry_delay=DO_OCR_RETRY_DELAY, ignore_result=True)
|
@app.task(bind=True, default_retry_delay=DO_OCR_RETRY_DELAY, ignore_result=True)
|
||||||
def task_do_ocr(self, document_version_pk):
|
def task_do_ocr(self, document_version_pk):
|
||||||
|
DocumentVersion = apps.get_model(
|
||||||
|
app_label='documents', model_name='DocumentVersion'
|
||||||
|
)
|
||||||
|
DocumentPageOCRContent = apps.get_model(
|
||||||
|
app_label='ocr', model_name='DocumentPageOCRContent'
|
||||||
|
)
|
||||||
|
|
||||||
lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk
|
lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk
|
||||||
try:
|
try:
|
||||||
logger.debug('trying to acquire lock: %s', lock_id)
|
logger.debug('trying to acquire lock: %s', lock_id)
|
||||||
# Acquire lock to avoid doing OCR on the same document version more than
|
# Acquire lock to avoid doing OCR on the same document version more
|
||||||
# once concurrently
|
# than once concurrently
|
||||||
lock = locking_backend.acquire_lock(lock_id, LOCK_EXPIRE)
|
lock = locking_backend.acquire_lock(lock_id, LOCK_EXPIRE)
|
||||||
logger.debug('acquired lock: %s', lock_id)
|
logger.debug('acquired lock: %s', lock_id)
|
||||||
document_version = None
|
document_version = None
|
||||||
try:
|
try:
|
||||||
document_version = DocumentVersion.objects.get(pk=document_version_pk)
|
document_version = DocumentVersion.objects.get(
|
||||||
|
pk=document_version_pk
|
||||||
|
)
|
||||||
logger.info(
|
logger.info(
|
||||||
'Starting document OCR for document version: %s',
|
'Starting document OCR for document version: %s',
|
||||||
document_version
|
document_version
|
||||||
)
|
)
|
||||||
TextExtractor.process_document_version(document_version)
|
DocumentPageOCRContent.objects.process_document_version(
|
||||||
|
document_version=document_version
|
||||||
|
)
|
||||||
except OperationalError as exception:
|
except OperationalError as exception:
|
||||||
logger.warning(
|
logger.warning(
|
||||||
'OCR error for document version: %d; %s. Retrying.',
|
'OCR error for document version: %d; %s. Retrying.',
|
||||||
document_version_pk, exception
|
document_version_pk, exception
|
||||||
)
|
)
|
||||||
raise self.retry(exc=exception)
|
raise self.retry(exc=exception)
|
||||||
except Exception as exception:
|
|
||||||
logger.error(
|
|
||||||
'OCR error for document version: %d; %s', document_version_pk,
|
|
||||||
exception
|
|
||||||
)
|
|
||||||
if document_version:
|
|
||||||
entry, created = DocumentVersionOCRError.objects.get_or_create(
|
|
||||||
document_version=document_version
|
|
||||||
)
|
|
||||||
|
|
||||||
if settings.DEBUG:
|
|
||||||
result = []
|
|
||||||
type, value, tb = sys.exc_info()
|
|
||||||
result.append('%s: %s' % (type.__name__, value))
|
|
||||||
result.extend(traceback.format_tb(tb))
|
|
||||||
entry.result = '\n'.join(result)
|
|
||||||
else:
|
|
||||||
entry.result = exception
|
|
||||||
|
|
||||||
entry.save()
|
|
||||||
else:
|
|
||||||
logger.info(
|
|
||||||
'OCR complete for document version: %s', document_version
|
|
||||||
)
|
|
||||||
try:
|
|
||||||
entry = DocumentVersionOCRError.objects.get(
|
|
||||||
document_version=document_version
|
|
||||||
)
|
|
||||||
except DocumentVersionOCRError.DoesNotExist:
|
|
||||||
pass
|
|
||||||
else:
|
|
||||||
entry.delete()
|
|
||||||
|
|
||||||
post_document_version_ocr.send(
|
|
||||||
sender=self, instance=document_version
|
|
||||||
)
|
|
||||||
finally:
|
finally:
|
||||||
lock.release()
|
lock.release()
|
||||||
except LockError:
|
except LockError:
|
||||||
|
|||||||
35
mayan/apps/ocr/tests/test_events.py
Normal file
35
mayan/apps/ocr/tests/test_events.py
Normal file
@@ -0,0 +1,35 @@
|
|||||||
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
from actstream.models import Action
|
||||||
|
|
||||||
|
from documents.tests.test_models import GenericDocumentTestCase
|
||||||
|
|
||||||
|
from ..events import (
|
||||||
|
event_ocr_document_version_submit, event_ocr_document_version_finish
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class OCREventsTestCase(GenericDocumentTestCase):
    """
    Check the actions stored after a document is submitted for OCR.

    Each test clears the stored actions, submits the test document for OCR
    and then inspects one end of the resulting action list.
    """

    def test_document_version_submit_event(self):
        # Start from an empty action list so only this submission is seen.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        # The submit event is expected at the ``last()`` end of the list.
        action = Action.objects.last()

        self.assertEqual(action.target, self.document.latest_version)
        self.assertEqual(action.verb, event_ocr_document_version_submit.name)

    def test_document_version_finish_event(self):
        # Start from an empty action list so only this submission is seen.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        # The finish event is expected at the ``first()`` end of the list.
        action = Action.objects.first()

        self.assertEqual(action.target, self.document.latest_version)
        self.assertEqual(action.verb, event_ocr_document_version_finish.name)
|
||||||
@@ -1,83 +0,0 @@
|
|||||||
from __future__ import unicode_literals
|
|
||||||
|
|
||||||
from django.core.files.base import File
|
|
||||||
from django.test import override_settings
|
|
||||||
|
|
||||||
from common.tests import BaseTestCase
|
|
||||||
from documents.models import DocumentType
|
|
||||||
from documents.tests import (
|
|
||||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
|
||||||
)
|
|
||||||
|
|
||||||
from ..classes import TextExtractor
|
|
||||||
from ..parsers import PDFMinerParser, PopplerParser
|
|
||||||
|
|
||||||
|
|
||||||
@override_settings(OCR_AUTO_OCR=False)
class ParserTestCase(BaseTestCase):
    """
    Run each PDF parser against a document created from TEST_DOCUMENT_PATH
    and check the text stored for its first page.

    Automatic OCR is switched off through the OCR_AUTO_OCR setting override.
    """

    def setUp(self):
        super(ParserTestCase, self).setUp()
        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # The fixture is handed to the PDF parsers, so read it as raw bytes:
        # binary mode avoids newline translation and text decoding of the
        # file content.
        with open(TEST_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(ParserTestCase, self).tearDown()

    def test_pdfminer_parser(self):
        parser = PDFMinerParser()

        parser.process_document_version(self.document.latest_version)

        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )

    def test_poppler_parser(self):
        parser = PopplerParser()

        parser.process_document_version(self.document.latest_version)

        # assertIn reports both operands on failure, unlike assertTrue.
        self.assertIn(
            'Mayan EDMS Documentation',
            self.document.pages.first().ocr_content.content
        )
|
|
||||||
|
|
||||||
|
|
||||||
@override_settings(OCR_AUTO_OCR=False)
class TextExtractorTestCase(BaseTestCase):
    """
    Run TextExtractor over a document created from
    TEST_HYBRID_DOCUMENT_PATH and check the content stored for its first
    and last pages.

    Automatic OCR is switched off through the OCR_AUTO_OCR setting override.
    """

    def setUp(self):
        super(TextExtractorTestCase, self).setUp()

        self.document_type = DocumentType.objects.create(
            label=TEST_DOCUMENT_TYPE_LABEL
        )

        # Read the fixture as raw bytes: binary mode avoids newline
        # translation and text decoding of the file content.
        with open(TEST_HYBRID_DOCUMENT_PATH, 'rb') as file_object:
            self.document = self.document_type.new_document(
                file_object=File(file_object)
            )

    def tearDown(self):
        self.document_type.delete()
        super(TextExtractorTestCase, self).tearDown()

    def test_text_extractor(self):
        TextExtractor.process_document_version(
            document_version=self.document.latest_version
        )

        pages = self.document.latest_version.pages

        self.assertEqual(
            pages.first().ocr_content.content,
            'Sample text',
        )

        self.assertEqual(
            pages.last().ocr_content.content,
            'Sample text in image form',
        )
|
|
||||||
@@ -3,7 +3,8 @@ from __future__ import unicode_literals
|
|||||||
from django.conf.urls import url
|
from django.conf.urls import url
|
||||||
|
|
||||||
from .api_views import (
|
from .api_views import (
|
||||||
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
|
APIDocumentOCRView, APIDocumentPageOCRContentView,
|
||||||
|
APIDocumentVersionOCRView
|
||||||
)
|
)
|
||||||
from .views import (
|
from .views import (
|
||||||
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
||||||
@@ -59,7 +60,8 @@ api_urls = [
|
|||||||
name='document-version-ocr-submit-view'
|
name='document-version-ocr-submit-view'
|
||||||
),
|
),
|
||||||
url(
|
url(
|
||||||
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
|
r'^page/(?P<pk>\d+)/content/$',
|
||||||
|
APIDocumentPageOCRContentView.as_view(),
|
||||||
name='document-page-content-view'
|
name='document-page-content-view'
|
||||||
),
|
),
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -3,14 +3,14 @@ from __future__ import unicode_literals
|
|||||||
from django.utils.encoding import force_text
|
from django.utils.encoding import force_text
|
||||||
from django.utils.html import conditional_escape
|
from django.utils.html import conditional_escape
|
||||||
|
|
||||||
from .models import DocumentPageContent
|
from .models import DocumentPageOCRContent
|
||||||
|
|
||||||
|
|
||||||
def get_document_ocr_content(document):
|
def get_document_ocr_content(document):
|
||||||
for page in document.pages.all():
|
for page in document.pages.all():
|
||||||
try:
|
try:
|
||||||
page_content = page.ocr_content.content
|
page_content = page.ocr_content.content
|
||||||
except DocumentPageContent.DoesNotExist:
|
except DocumentPageOCRContent.DoesNotExist:
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
yield conditional_escape(force_text(page_content))
|
yield conditional_escape(force_text(page_content))
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ from common.generics import (
|
|||||||
from common.mixins import MultipleInstanceActionMixin
|
from common.mixins import MultipleInstanceActionMixin
|
||||||
from documents.models import Document, DocumentType
|
from documents.models import Document, DocumentType
|
||||||
|
|
||||||
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
from .forms import DocumentOCRContentForm, DocumentTypeSelectForm
|
||||||
from .models import DocumentVersionOCRError
|
from .models import DocumentVersionOCRError
|
||||||
from .permissions import (
|
from .permissions import (
|
||||||
permission_ocr_content_view, permission_ocr_document,
|
permission_ocr_content_view, permission_ocr_document,
|
||||||
@@ -40,6 +40,27 @@ class DocumentAllSubmitView(ConfirmView):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentOCRContent(SingleObjectDetailView):
    """
    Detail view for a Document that uses DocumentOCRContentForm as its form
    and requires the ``permission_ocr_content_view`` object permission.
    """
    form_class = DocumentOCRContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        # Let the parent dispatch run first: if it raises, the document is
        # not added to the user's recent documents.
        result = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )
        self.get_object().add_as_recent_document_for_user(request.user)
        return result

    def get_extra_context(self):
        """
        Return the extra template context: the document (under both the
        'document' and 'object' keys), the 'hide_labels' flag and the title.
        """
        # Resolve the object a single time instead of once per context key.
        document = self.get_object()

        return {
            'document': document,
            'hide_labels': True,
            'object': document,
            'title': _('OCR result for document: %s') % document,
        }
|
||||||
|
|
||||||
|
|
||||||
class DocumentSubmitView(ConfirmView):
|
class DocumentSubmitView(ConfirmView):
|
||||||
def get_extra_context(self):
|
def get_extra_context(self):
|
||||||
return {
|
return {
|
||||||
@@ -128,27 +149,6 @@ class DocumentTypeSettingsEditView(SingleObjectEditView):
|
|||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
class DocumentOCRContent(SingleObjectDetailView):
    """
    Detail view for a Document that uses DocumentContentForm as its form
    and requires the ``permission_ocr_content_view`` object permission.
    """
    form_class = DocumentContentForm
    model = Document
    object_permission = permission_ocr_content_view

    def dispatch(self, request, *args, **kwargs):
        # Let the parent dispatch run first: if it raises, the document is
        # not added to the user's recent documents.
        result = super(DocumentOCRContent, self).dispatch(
            request, *args, **kwargs
        )
        self.get_object().add_as_recent_document_for_user(request.user)
        return result

    def get_extra_context(self):
        """
        Return the extra template context: the document (under both the
        'document' and 'object' keys), the 'hide_labels' flag and the title.
        """
        # Resolve the object a single time instead of once per context key.
        document = self.get_object()

        return {
            'document': document,
            'hide_labels': True,
            'object': document,
            'title': _('OCR result for document: %s') % document,
        }
|
|
||||||
|
|
||||||
|
|
||||||
class EntryListView(SingleObjectListView):
|
class EntryListView(SingleObjectListView):
|
||||||
extra_context = {
|
extra_context = {
|
||||||
'hide_object': True,
|
'hide_object': True,
|
||||||
|
|||||||
Reference in New Issue
Block a user