Refactor OCR app. Remove document parsing. Move OCR processing to
the model manager. Add submit and finish events. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -3,12 +3,12 @@ from __future__ import unicode_literals
|
||||
from django.contrib import admin
|
||||
|
||||
from .models import (
|
||||
DocumentPageContent, DocumentTypeSettings, DocumentVersionOCRError
|
||||
DocumentPageOCRContent, DocumentTypeSettings, DocumentVersionOCRError
|
||||
)
|
||||
|
||||
|
||||
@admin.register(DocumentPageContent)
|
||||
class DocumentPageContentAdmin(admin.ModelAdmin):
|
||||
@admin.register(DocumentPageOCRContent)
|
||||
class DocumentPageOCRContentAdmin(admin.ModelAdmin):
|
||||
list_display = ('document_page',)
|
||||
|
||||
|
||||
|
||||
@@ -6,9 +6,9 @@ from rest_framework.response import Response
|
||||
from documents.models import Document, DocumentPage, DocumentVersion
|
||||
from rest_api.permissions import MayanPermission
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .models import DocumentPageOCRContent
|
||||
from .permissions import permission_ocr_content_view, permission_ocr_document
|
||||
from .serializers import DocumentPageContentSerializer
|
||||
from .serializers import DocumentPageOCRContentSerializer
|
||||
|
||||
|
||||
class APIDocumentOCRView(generics.GenericAPIView):
|
||||
@@ -67,7 +67,7 @@ class APIDocumentVersionOCRView(generics.GenericAPIView):
|
||||
return Response(status=status.HTTP_202_ACCEPTED)
|
||||
|
||||
|
||||
class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||
class APIDocumentPageOCRContentView(generics.RetrieveAPIView):
|
||||
"""
|
||||
Returns the OCR content of the selected document page.
|
||||
---
|
||||
@@ -82,7 +82,7 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||
'GET': (permission_ocr_content_view,),
|
||||
}
|
||||
permission_classes = (MayanPermission,)
|
||||
serializer_class = DocumentPageContentSerializer
|
||||
serializer_class = DocumentPageOCRContentSerializer
|
||||
queryset = DocumentPage.objects.all()
|
||||
|
||||
def retrieve(self, request, *args, **kwargs):
|
||||
@@ -90,8 +90,8 @@ class APIDocumentPageContentView(generics.RetrieveAPIView):
|
||||
|
||||
try:
|
||||
ocr_content = instance.ocr_content
|
||||
except DocumentPageContent.DoesNotExist:
|
||||
ocr_content = DocumentPageContent.objects.none()
|
||||
except DocumentPageOCRContent.DoesNotExist:
|
||||
ocr_content = DocumentPageOCRContent.objects.none()
|
||||
|
||||
serializer = self.get_serializer(ocr_content)
|
||||
return Response(serializer.data)
|
||||
|
||||
@@ -1,11 +1,13 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from datetime import timedelta
|
||||
import logging
|
||||
|
||||
from kombu import Exchange, Queue
|
||||
|
||||
from django.apps import apps
|
||||
from django.db.models.signals import post_save
|
||||
from django.utils.timezone import now
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from acls import ModelPermission
|
||||
@@ -21,7 +23,10 @@ from mayan.celery import app
|
||||
from navigation import SourceColumn
|
||||
from rest_api.classes import APIEndPoint
|
||||
|
||||
from .handlers import initialize_new_ocr_settings, post_version_upload_ocr
|
||||
from .events import event_ocr_document_version_submit
|
||||
from .handlers import (
|
||||
handler_initialize_new_ocr_settings, handler_ocr_document_version,
|
||||
)
|
||||
from .links import (
|
||||
link_document_content, link_document_ocr_download,
|
||||
link_document_ocr_erros_list, link_document_submit,
|
||||
@@ -36,17 +41,17 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def document_ocr_submit(self):
|
||||
from .tasks import task_do_ocr
|
||||
|
||||
task_do_ocr.apply_async(args=(self.latest_version.pk,))
|
||||
self.latest_version.submit_for_ocr()
|
||||
|
||||
|
||||
def document_version_ocr_submit(self):
|
||||
from .tasks import task_do_ocr
|
||||
|
||||
event_ocr_document_version_submit.commit(target=self)
|
||||
|
||||
task_do_ocr.apply_async(
|
||||
eta=now() + timedelta(seconds=settings_db_sync_task_delay.value),
|
||||
kwargs={'document_version_pk': self.pk},
|
||||
countdown=settings_db_sync_task_delay.value
|
||||
)
|
||||
|
||||
|
||||
@@ -155,10 +160,12 @@ class OCRApp(MayanAppConfig):
|
||||
)
|
||||
|
||||
post_save.connect(
|
||||
initialize_new_ocr_settings,
|
||||
dispatch_uid='initialize_new_ocr_settings', sender=DocumentType
|
||||
dispatch_uid='ocr_handler_initialize_new_ocr_settings',
|
||||
receiver=handler_initialize_new_ocr_settings,
|
||||
sender=DocumentType
|
||||
)
|
||||
post_version_upload.connect(
|
||||
post_version_upload_ocr, dispatch_uid='post_version_upload_ocr',
|
||||
dispatch_uid='ocr_handler_ocr_document_version',
|
||||
receiver=handler_ocr_document_version,
|
||||
sender=DocumentVersion
|
||||
)
|
||||
|
||||
@@ -1,80 +1,9 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
|
||||
from django.utils.module_loading import import_string
|
||||
|
||||
from converter import converter_class
|
||||
from documents.runtime import cache_storage_backend
|
||||
|
||||
from .exceptions import NoMIMETypeMatch, ParserError
|
||||
from .models import DocumentPageContent
|
||||
from .parsers import Parser
|
||||
from .settings import setting_ocr_backend
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class TextExtractor(object):
|
||||
@classmethod
|
||||
def perform_ocr(cls, document_page):
|
||||
ocr_backend_class = import_string(setting_ocr_backend.value)
|
||||
backend = ocr_backend_class()
|
||||
backend.process_document_page(document_page)
|
||||
|
||||
@classmethod
|
||||
def process_document_page(cls, document_page):
|
||||
"""
|
||||
Extract text for a document version's page. Try parsing the page and if
|
||||
no there are not parsers for the MIME type or the parser return nothing
|
||||
fallback to doing and OCR of the page.
|
||||
"""
|
||||
|
||||
try:
|
||||
Parser.parse_document_page(document_page=document_page)
|
||||
except (NoMIMETypeMatch, ParserError):
|
||||
cls.perform_ocr(document_page=document_page)
|
||||
else:
|
||||
if not document_page.ocr_content.content:
|
||||
cls.perform_ocr(document_page=document_page)
|
||||
|
||||
@classmethod
|
||||
def process_document_version(cls, document_version):
|
||||
for document_page in document_version.pages.all():
|
||||
cls.process_document_page(document_page=document_page)
|
||||
|
||||
|
||||
class OCRBackendBase(object):
|
||||
def process_document_version(self, document_version):
|
||||
logger.info('Starting OCR for document version: %s', document_version)
|
||||
logger.debug('document version: %d', document_version.pk)
|
||||
|
||||
for document_page in document_version.pages.all():
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page):
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
cache_filename = document_page.generate_image()
|
||||
|
||||
with cache_storage_backend.open(cache_filename) as file_object:
|
||||
document_page_content, created = DocumentPageContent.objects.get_or_create(
|
||||
document_page=document_page
|
||||
)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=file_object,
|
||||
language=document_page.document.language
|
||||
)
|
||||
document_page_content.save()
|
||||
|
||||
logger.info(
|
||||
'Finished processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
def execute(self, file_object, language=None, transformations=None):
|
||||
self.language = language
|
||||
|
||||
|
||||
14
mayan/apps/ocr/events.py
Normal file
14
mayan/apps/ocr/events.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from __future__ import absolute_import, unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from events.classes import Event
|
||||
|
||||
# Event recorded when a document version is submitted for OCR.
event_ocr_document_version_submit = Event(
    label=_('Document version submitted for OCR'),
    name='ocr_document_version_submit'
)

# Event recorded when the OCR of a document version has finished.
event_ocr_document_version_finish = Event(
    label=_('Document version OCR finished'),
    name='ocr_document_version_finish'
)
|
||||
@@ -6,17 +6,3 @@ class OCRError(Exception):
|
||||
Raised by the OCR backend
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ParserError(Exception):
|
||||
"""
|
||||
Base exception for file parsers
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class NoMIMETypeMatch(ParserError):
|
||||
"""
|
||||
There is no parser registered for the specified MIME type
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -9,17 +9,17 @@ from django.utils.translation import ugettext_lazy as _, ugettext
|
||||
from common.widgets import TextAreaDiv
|
||||
from documents.models import DocumentType
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .models import DocumentPageOCRContent
|
||||
|
||||
|
||||
class DocumentContentForm(forms.Form):
|
||||
class DocumentOCRContentForm(forms.Form):
|
||||
"""
|
||||
Form that concatenates all of a document pages' text content into a
|
||||
single textarea widget
|
||||
"""
|
||||
def __init__(self, *args, **kwargs):
|
||||
self.document = kwargs.pop('instance', None)
|
||||
super(DocumentContentForm, self).__init__(*args, **kwargs)
|
||||
super(DocumentOCRContentForm, self).__init__(*args, **kwargs)
|
||||
content = []
|
||||
self.fields['contents'].initial = ''
|
||||
try:
|
||||
@@ -30,7 +30,7 @@ class DocumentContentForm(forms.Form):
|
||||
for page in document_pages:
|
||||
try:
|
||||
page_content = page.ocr_content.content
|
||||
except DocumentPageContent.DoesNotExist:
|
||||
except DocumentPageOCRContent.DoesNotExist:
|
||||
pass
|
||||
else:
|
||||
content.append(conditional_escape(force_text(page_content)))
|
||||
|
||||
@@ -9,14 +9,7 @@ from .settings import setting_auto_ocr
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def post_version_upload_ocr(sender, instance, **kwargs):
|
||||
logger.debug('received post_version_upload')
|
||||
logger.debug('instance pk: %s', instance.pk)
|
||||
if instance.document.document_type.ocr_settings.auto_ocr:
|
||||
instance.submit_for_ocr()
|
||||
|
||||
|
||||
def initialize_new_ocr_settings(sender, instance, **kwargs):
|
||||
def handler_initialize_new_ocr_settings(sender, instance, **kwargs):
|
||||
DocumentTypeSettings = apps.get_model(
|
||||
app_label='ocr', model_name='DocumentTypeSettings'
|
||||
)
|
||||
@@ -25,3 +18,10 @@ def initialize_new_ocr_settings(sender, instance, **kwargs):
|
||||
DocumentTypeSettings.objects.create(
|
||||
document_type=instance, auto_ocr=setting_auto_ocr.value
|
||||
)
|
||||
|
||||
|
||||
def handler_ocr_document_version(sender, instance, **kwargs):
    """
    Signal receiver that submits the received instance for OCR when the
    OCR settings of its document's type have auto_ocr enabled.
    """
    logger.debug('received post_version_upload')
    logger.debug('instance pk: %s', instance.pk)

    ocr_settings = instance.document.document_type.ocr_settings
    if not ocr_settings.auto_ocr:
        # Automatic OCR is disabled for this document type, nothing to do.
        return

    instance.submit_for_ocr()
|
||||
|
||||
82
mayan/apps/ocr/managers.py
Normal file
82
mayan/apps/ocr/managers.py
Normal file
@@ -0,0 +1,82 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from django.apps import apps
|
||||
from django.conf import settings
|
||||
from django.db import models
|
||||
|
||||
from documents.runtime import cache_storage_backend
|
||||
|
||||
from .events import event_ocr_document_version_finish
|
||||
from .runtime import ocr_backend
|
||||
from .signals import post_document_version_ocr
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class DocumentPageOCRContentManager(models.Manager):
    """
    Manager that runs the configured OCR backend over document pages and
    stores the returned text as DocumentPageOCRContent rows.
    """

    def process_document_version(self, document_version):
        """
        Run OCR on every page of document_version.

        An exception raised while processing the pages is not propagated:
        it is logged and stored as a new entry of
        document_version.ocr_errors (including the traceback when
        settings.DEBUG is enabled). When all pages are processed without
        error, the existing error entries are deleted, the OCR finish event
        is committed and the post_document_version_ocr signal is sent.
        """
        logger.info('Starting OCR for document version: %s', document_version)
        logger.debug('document version: %d', document_version.pk)

        try:
            for document_page in document_version.pages.all():
                self.process_document_page(document_page=document_page)
        except Exception as exception:
            # %d needs a number: pass the primary key, not the model
            # instance, otherwise the logging call itself fails and the
            # real error message is lost.
            logger.error(
                'OCR error for document version: %d; %s',
                document_version.pk, exception
            )

            if settings.DEBUG:
                # Keep the full traceback to ease debugging.
                result = []
                exception_type, value, tb = sys.exc_info()
                result.append('%s: %s' % (exception_type.__name__, value))
                result.extend(traceback.format_tb(tb))
                document_version.ocr_errors.create(
                    result='\n'.join(result)
                )
            else:
                document_version.ocr_errors.create(result=exception)
        else:
            logger.info(
                'OCR complete for document version: %s', document_version
            )
            # A clean run supersedes the errors stored by earlier attempts.
            document_version.ocr_errors.all().delete()

            event_ocr_document_version_finish.commit(target=document_version)

            post_document_version_ocr.send(
                sender=document_version.__class__, instance=document_version
            )

    def process_document_page(self, document_page):
        """
        Run OCR on a single page and store the resulting text.

        The page image is generated and opened from the cache storage, then
        passed to ocr_backend.execute() together with the language of the
        document. The returned text is saved in the DocumentPageOCRContent
        row of the page, which is created when it does not exist yet.
        """
        logger.info(
            'Processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )

        # Resolved through the app registry instead of a module level
        # import; presumably to avoid a circular import with the models
        # module, which imports this manager.
        DocumentPageOCRContent = apps.get_model(
            app_label='ocr', model_name='DocumentPageOCRContent'
        )

        # TODO: Call task and wait
        cache_filename = document_page.generate_image()

        with cache_storage_backend.open(cache_filename) as file_object:
            document_page_content, created = DocumentPageOCRContent.objects.get_or_create(
                document_page=document_page
            )
            document_page_content.content = ocr_backend.execute(
                file_object=file_object,
                language=document_page.document.language
            )
            document_page_content.save()

        logger.info(
            'Finished processing page: %d of document version: %s',
            document_page.page_number, document_page.document_version
        )
|
||||
67
mayan/apps/ocr/migrations/0006_auto_20170823_0553.py
Normal file
67
mayan/apps/ocr/migrations/0006_auto_20170823_0553.py
Normal file
@@ -0,0 +1,67 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
# Generated by Django 1.10.7 on 2017-08-23 05:53
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('documents', '0040_auto_20170725_1111'),
|
||||
('ocr', '0005_auto_20170630_1846'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.CreateModel(
|
||||
name='DocumentPageOCRContent',
|
||||
fields=[
|
||||
(
|
||||
'id', models.AutoField(
|
||||
auto_created=True, primary_key=True, serialize=False,
|
||||
verbose_name='ID'
|
||||
)
|
||||
),
|
||||
(
|
||||
'content', models.TextField(
|
||||
blank=True, verbose_name='Content'
|
||||
)
|
||||
),
|
||||
(
|
||||
'document_page', models.OneToOneField(
|
||||
on_delete=django.db.models.deletion.CASCADE,
|
||||
related_name='ocr', to='documents.DocumentPage',
|
||||
verbose_name='Document page'
|
||||
)
|
||||
),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Document page OCR content',
|
||||
'verbose_name_plural': 'Document pages OCR contents',
|
||||
},
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='documentpagecontent',
|
||||
name='document_page',
|
||||
),
|
||||
migrations.AlterModelOptions(
|
||||
name='documentversionocrerror',
|
||||
options={
|
||||
'ordering': ('datetime_submitted',),
|
||||
'verbose_name': 'Document version OCR error',
|
||||
'verbose_name_plural': 'Document version OCR errors'
|
||||
},
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='documentversionocrerror',
|
||||
name='datetime_submitted',
|
||||
field=models.DateTimeField(
|
||||
auto_now_add=True, db_index=True,
|
||||
verbose_name='Date time submitted'
|
||||
),
|
||||
),
|
||||
migrations.DeleteModel(
|
||||
name='DocumentPageContent',
|
||||
),
|
||||
]
|
||||
@@ -6,6 +6,8 @@ from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from documents.models import DocumentPage, DocumentType, DocumentVersion
|
||||
|
||||
from .managers import DocumentPageOCRContentManager
|
||||
|
||||
|
||||
class DocumentTypeSettings(models.Model):
|
||||
"""
|
||||
@@ -25,6 +27,24 @@ class DocumentTypeSettings(models.Model):
|
||||
verbose_name_plural = _('Document types settings')
|
||||
|
||||
|
||||
@python_2_unicode_compatible
class DocumentPageOCRContent(models.Model):
    """
    Text content of a single document page, one row per page.
    """
    document_page = models.OneToOneField(
        DocumentPage,
        on_delete=models.CASCADE,
        related_name='ocr_content',
        verbose_name=_('Document page')
    )
    content = models.TextField(
        blank=True,
        verbose_name=_('Content')
    )

    # Custom manager that also provides the OCR processing entry points.
    objects = DocumentPageOCRContentManager()

    class Meta:
        verbose_name = _('Document page OCR content')
        verbose_name_plural = _('Document pages OCR contents')

    def __str__(self):
        # Represented by the text form of the page the content belongs to.
        return force_text(self.document_page)
|
||||
|
||||
|
||||
@python_2_unicode_compatible
|
||||
class DocumentVersionOCRError(models.Model):
|
||||
document_version = models.ForeignKey(
|
||||
@@ -32,7 +52,7 @@ class DocumentVersionOCRError(models.Model):
|
||||
verbose_name=_('Document version')
|
||||
)
|
||||
datetime_submitted = models.DateTimeField(
|
||||
auto_now=True, db_index=True, verbose_name=_('Date time submitted')
|
||||
auto_now_add=True, db_index=True, verbose_name=_('Date time submitted')
|
||||
)
|
||||
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
|
||||
|
||||
@@ -41,24 +61,5 @@ class DocumentVersionOCRError(models.Model):
|
||||
|
||||
class Meta:
|
||||
ordering = ('datetime_submitted',)
|
||||
verbose_name = _('Document Version OCR Error')
|
||||
verbose_name_plural = _('Document Version OCR Errors')
|
||||
|
||||
|
||||
@python_2_unicode_compatible
|
||||
class DocumentPageContent(models.Model):
|
||||
"""
|
||||
Model that describes a document page content
|
||||
"""
|
||||
document_page = models.OneToOneField(
|
||||
DocumentPage, on_delete=models.CASCADE, related_name='ocr_content',
|
||||
verbose_name=_('Document page')
|
||||
)
|
||||
content = models.TextField(blank=True, verbose_name=_('Content'))
|
||||
|
||||
def __str__(self):
|
||||
return force_text(self.document_page)
|
||||
|
||||
class Meta:
|
||||
verbose_name = _('Document page content')
|
||||
verbose_name_plural = _('Document pages contents')
|
||||
verbose_name = _('Document version OCR error')
|
||||
verbose_name_plural = _('Document version OCR errors')
|
||||
|
||||
@@ -1,202 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from io import BytesIO
|
||||
import logging
|
||||
import os
|
||||
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.converter import TextConverter
|
||||
from pdfminer.layout import LAParams
|
||||
import subprocess
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from common.utils import copyfile, fs_cleanup, mkstemp
|
||||
|
||||
from .exceptions import ParserError, NoMIMETypeMatch
|
||||
from .models import DocumentPageContent
|
||||
from .settings import setting_pdftotext_path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class Parser(object):
|
||||
"""
|
||||
Parser base class
|
||||
"""
|
||||
|
||||
_registry = {}
|
||||
|
||||
@classmethod
|
||||
def register(cls, mimetypes, parser_classes):
|
||||
for mimetype in mimetypes:
|
||||
for parser_class in parser_classes:
|
||||
cls._registry.setdefault(
|
||||
mimetype, []
|
||||
).append(parser_class)
|
||||
|
||||
@classmethod
|
||||
def parse_document_version(cls, document_version):
|
||||
try:
|
||||
for parser_class in cls._registry[document_version.mimetype]:
|
||||
try:
|
||||
parser = parser_class()
|
||||
parser.process_document_version(document_version)
|
||||
except ParserError:
|
||||
# If parser raises error, try next parser in the list
|
||||
pass
|
||||
else:
|
||||
# If parser was successfull there is no need to try
|
||||
# others in the list for this mimetype
|
||||
return
|
||||
|
||||
raise NoMIMETypeMatch('Parser MIME type list exhausted')
|
||||
except KeyError:
|
||||
raise NoMIMETypeMatch
|
||||
|
||||
@classmethod
|
||||
def parse_document_page(cls, document_page):
|
||||
try:
|
||||
for parser_class in cls._registry[document_page.document_version.mimetype]:
|
||||
try:
|
||||
parser = parser_class()
|
||||
parser.process_document_page(document_page)
|
||||
except ParserError:
|
||||
# If parser raises error, try next parser in the list
|
||||
pass
|
||||
else:
|
||||
# If parser was successfull there is no need to try
|
||||
# others in the list for this mimetype
|
||||
return
|
||||
raise NoMIMETypeMatch('Parser MIME type list exhausted')
|
||||
except KeyError:
|
||||
raise NoMIMETypeMatch
|
||||
|
||||
def process_document_version(self, document_version):
|
||||
logger.info(
|
||||
'Starting parsing for document version: %s', document_version
|
||||
)
|
||||
logger.debug('document version: %d', document_version.pk)
|
||||
|
||||
for document_page in document_version.pages.all():
|
||||
self.process_document_page(document_page=document_page)
|
||||
|
||||
def process_document_page(self, document_page):
|
||||
logger.info(
|
||||
'Processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
file_object = document_page.document_version.get_intermidiate_file()
|
||||
|
||||
try:
|
||||
document_page_content, created = DocumentPageContent.objects.get_or_create(
|
||||
document_page=document_page
|
||||
)
|
||||
document_page_content.content = self.execute(
|
||||
file_object=file_object, page_number=document_page.page_number
|
||||
)
|
||||
document_page_content.save()
|
||||
except Exception as exception:
|
||||
error_message = _('Exception parsing page; %s') % exception
|
||||
logger.error(error_message)
|
||||
raise ParserError(error_message)
|
||||
finally:
|
||||
file_object.close()
|
||||
|
||||
logger.info(
|
||||
'Finished processing page: %d of document version: %s',
|
||||
document_page.page_number, document_page.document_version
|
||||
)
|
||||
|
||||
def execute(self, file_object, page_number):
|
||||
raise NotImplementedError(
|
||||
'Your %s class has not defined the required execute() method.' %
|
||||
self.__class__.__name__
|
||||
)
|
||||
|
||||
|
||||
class PopplerParser(Parser):
|
||||
"""
|
||||
PDF parser using the pdftotext execute from the poppler package
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
self.pdftotext_path = setting_pdftotext_path.value
|
||||
if not os.path.exists(self.pdftotext_path):
|
||||
error_message = _(
|
||||
'Cannot find pdftotext executable at: %s'
|
||||
) % self.pdftotext_path
|
||||
logger.error(error_message)
|
||||
raise ParserError(error_message)
|
||||
|
||||
logger.debug('self.pdftotext_path: %s', self.pdftotext_path)
|
||||
|
||||
def execute(self, file_object, page_number):
|
||||
logger.debug('Parsing PDF page: %d', page_number)
|
||||
|
||||
destination_descriptor, temp_filepath = mkstemp()
|
||||
copyfile(file_object, temp_filepath)
|
||||
|
||||
command = []
|
||||
command.append(self.pdftotext_path)
|
||||
command.append('-f')
|
||||
command.append(str(page_number))
|
||||
command.append('-l')
|
||||
command.append(str(page_number))
|
||||
command.append(temp_filepath)
|
||||
command.append('-')
|
||||
|
||||
proc = subprocess.Popen(
|
||||
command, close_fds=True, stderr=subprocess.PIPE,
|
||||
stdout=subprocess.PIPE
|
||||
)
|
||||
return_code = proc.wait()
|
||||
if return_code != 0:
|
||||
logger.error(proc.stderr.readline())
|
||||
fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)
|
||||
|
||||
raise ParserError
|
||||
|
||||
output = proc.stdout.read()
|
||||
fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)
|
||||
|
||||
if output == b'\x0c':
|
||||
logger.debug('Parser didn\'t return any output')
|
||||
return ''
|
||||
|
||||
if output[-3:] == b'\x0a\x0a\x0c':
|
||||
return output[:-3]
|
||||
|
||||
return output
|
||||
|
||||
|
||||
class PDFMinerParser(Parser):
|
||||
"""
|
||||
Parser for PDF files using the PDFMiner library for Python
|
||||
"""
|
||||
|
||||
def execute(self, file_object, page_number):
|
||||
logger.debug('Parsing PDF page: %d', page_number)
|
||||
|
||||
with BytesIO() as string_buffer:
|
||||
rsrcmgr = PDFResourceManager()
|
||||
device = TextConverter(
|
||||
rsrcmgr, outfp=string_buffer, laparams=LAParams()
|
||||
)
|
||||
interpreter = PDFPageInterpreter(rsrcmgr, device)
|
||||
page = PDFPage.get_pages(
|
||||
file_object, maxpages=1, pagenos=(page_number - 1,)
|
||||
)
|
||||
interpreter.process_page(page.next())
|
||||
device.close()
|
||||
|
||||
logger.debug('Finished parsing PDF: %d', page_number)
|
||||
|
||||
return string_buffer.getvalue()
|
||||
|
||||
|
||||
Parser.register(
|
||||
mimetypes=('application/pdf',),
|
||||
parser_classes=(PopplerParser, PDFMinerParser)
|
||||
)
|
||||
5
mayan/apps/ocr/runtime.py
Normal file
5
mayan/apps/ocr/runtime.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from django.utils.module_loading import import_string
|
||||
|
||||
from .settings import setting_ocr_backend
|
||||
|
||||
# Shared OCR backend instance: the class is looked up from the dotted path
# stored in setting_ocr_backend and instantiated once, when this module is
# first imported.
ocr_backend = import_string(setting_ocr_backend.value)()
|
||||
@@ -2,10 +2,10 @@ from __future__ import unicode_literals
|
||||
|
||||
from rest_framework import serializers
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .models import DocumentPageOCRContent
|
||||
|
||||
|
||||
class DocumentPageContentSerializer(serializers.ModelSerializer):
|
||||
class DocumentPageOCRContentSerializer(serializers.ModelSerializer):
|
||||
class Meta:
|
||||
fields = ('content',)
|
||||
model = DocumentPageContent
|
||||
model = DocumentPageOCRContent
|
||||
|
||||
@@ -1,84 +1,53 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import logging
|
||||
import sys
|
||||
import traceback
|
||||
|
||||
from django.conf import settings
|
||||
from django.apps import apps
|
||||
from django.db import OperationalError
|
||||
|
||||
from documents.models import DocumentVersion
|
||||
from lock_manager import LockError
|
||||
from lock_manager.runtime import locking_backend
|
||||
from mayan.celery import app
|
||||
|
||||
from .classes import TextExtractor
|
||||
from .literals import DO_OCR_RETRY_DELAY, LOCK_EXPIRE
|
||||
from .models import DocumentVersionOCRError
|
||||
from .signals import post_document_version_ocr
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@app.task(bind=True, default_retry_delay=DO_OCR_RETRY_DELAY, ignore_result=True)
|
||||
def task_do_ocr(self, document_version_pk):
|
||||
DocumentVersion = apps.get_model(
|
||||
app_label='documents', model_name='DocumentVersion'
|
||||
)
|
||||
DocumentPageOCRContent = apps.get_model(
|
||||
app_label='ocr', model_name='DocumentPageOCRContent'
|
||||
)
|
||||
|
||||
lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk
|
||||
try:
|
||||
logger.debug('trying to acquire lock: %s', lock_id)
|
||||
# Acquire lock to avoid doing OCR on the same document version more than
|
||||
# once concurrently
|
||||
# Acquire lock to avoid doing OCR on the same document version more
|
||||
# than once concurrently
|
||||
lock = locking_backend.acquire_lock(lock_id, LOCK_EXPIRE)
|
||||
logger.debug('acquired lock: %s', lock_id)
|
||||
document_version = None
|
||||
try:
|
||||
document_version = DocumentVersion.objects.get(pk=document_version_pk)
|
||||
document_version = DocumentVersion.objects.get(
|
||||
pk=document_version_pk
|
||||
)
|
||||
logger.info(
|
||||
'Starting document OCR for document version: %s',
|
||||
document_version
|
||||
)
|
||||
TextExtractor.process_document_version(document_version)
|
||||
DocumentPageOCRContent.objects.process_document_version(
|
||||
document_version=document_version
|
||||
)
|
||||
except OperationalError as exception:
|
||||
logger.warning(
|
||||
'OCR error for document version: %d; %s. Retrying.',
|
||||
document_version_pk, exception
|
||||
)
|
||||
raise self.retry(exc=exception)
|
||||
except Exception as exception:
|
||||
logger.error(
|
||||
'OCR error for document version: %d; %s', document_version_pk,
|
||||
exception
|
||||
)
|
||||
if document_version:
|
||||
entry, created = DocumentVersionOCRError.objects.get_or_create(
|
||||
document_version=document_version
|
||||
)
|
||||
|
||||
if settings.DEBUG:
|
||||
result = []
|
||||
type, value, tb = sys.exc_info()
|
||||
result.append('%s: %s' % (type.__name__, value))
|
||||
result.extend(traceback.format_tb(tb))
|
||||
entry.result = '\n'.join(result)
|
||||
else:
|
||||
entry.result = exception
|
||||
|
||||
entry.save()
|
||||
else:
|
||||
logger.info(
|
||||
'OCR complete for document version: %s', document_version
|
||||
)
|
||||
try:
|
||||
entry = DocumentVersionOCRError.objects.get(
|
||||
document_version=document_version
|
||||
)
|
||||
except DocumentVersionOCRError.DoesNotExist:
|
||||
pass
|
||||
else:
|
||||
entry.delete()
|
||||
|
||||
post_document_version_ocr.send(
|
||||
sender=self, instance=document_version
|
||||
)
|
||||
finally:
|
||||
lock.release()
|
||||
except LockError:
|
||||
|
||||
35
mayan/apps/ocr/tests/test_events.py
Normal file
35
mayan/apps/ocr/tests/test_events.py
Normal file
@@ -0,0 +1,35 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from actstream.models import Action
|
||||
|
||||
from documents.tests.test_models import GenericDocumentTestCase
|
||||
|
||||
from ..events import (
|
||||
event_ocr_document_version_submit, event_ocr_document_version_finish
|
||||
)
|
||||
|
||||
|
||||
class OCREventsTestCase(GenericDocumentTestCase):
    def test_document_version_submit_event(self):
        # Start from a clean slate so only the OCR actions are present.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        # The submit event is expected to be the last recorded action.
        submit_target = Action.objects.last().target
        self.assertEqual(submit_target, self.document.latest_version)

        submit_verb = Action.objects.last().verb
        self.assertEqual(submit_verb, event_ocr_document_version_submit.name)

    def test_document_version_finish_event(self):
        # Start from a clean slate so only the OCR actions are present.
        Action.objects.all().delete()
        self.document.submit_for_ocr()

        # The finish event is expected to be the first recorded action.
        finish_target = Action.objects.first().target
        self.assertEqual(finish_target, self.document.latest_version)

        finish_verb = Action.objects.first().verb
        self.assertEqual(finish_verb, event_ocr_document_version_finish.name)
|
||||
@@ -1,83 +0,0 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.core.files.base import File
|
||||
from django.test import override_settings
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import DocumentType
|
||||
from documents.tests import (
|
||||
TEST_DOCUMENT_PATH, TEST_DOCUMENT_TYPE_LABEL, TEST_HYBRID_DOCUMENT_PATH
|
||||
)
|
||||
|
||||
from ..classes import TextExtractor
|
||||
from ..parsers import PDFMinerParser, PopplerParser
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
|
||||
class ParserTestCase(BaseTestCase):
|
||||
def setUp(self):
|
||||
super(ParserTestCase, self).setUp()
|
||||
self.document_type = DocumentType.objects.create(
|
||||
label=TEST_DOCUMENT_TYPE_LABEL
|
||||
)
|
||||
|
||||
with open(TEST_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=File(file_object)
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
self.document_type.delete()
|
||||
super(ParserTestCase, self).tearDown()
|
||||
|
||||
def test_pdfminer_parser(self):
|
||||
parser = PDFMinerParser()
|
||||
|
||||
parser.process_document_version(self.document.latest_version)
|
||||
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
||||
)
|
||||
|
||||
def test_poppler_parser(self):
|
||||
parser = PopplerParser()
|
||||
|
||||
parser.process_document_version(self.document.latest_version)
|
||||
|
||||
self.assertTrue(
|
||||
'Mayan EDMS Documentation' in self.document.pages.first().ocr_content.content
|
||||
)
|
||||
|
||||
|
||||
@override_settings(OCR_AUTO_OCR=False)
|
||||
class TextExtractorTestCase(BaseTestCase):
|
||||
def setUp(self):
|
||||
super(TextExtractorTestCase, self).setUp()
|
||||
|
||||
self.document_type = DocumentType.objects.create(
|
||||
label=TEST_DOCUMENT_TYPE_LABEL
|
||||
)
|
||||
|
||||
with open(TEST_HYBRID_DOCUMENT_PATH) as file_object:
|
||||
self.document = self.document_type.new_document(
|
||||
file_object=File(file_object)
|
||||
)
|
||||
|
||||
def tearDown(self):
|
||||
self.document_type.delete()
|
||||
super(TextExtractorTestCase, self).tearDown()
|
||||
|
||||
def test_text_extractor(self):
|
||||
TextExtractor.process_document_version(
|
||||
document_version=self.document.latest_version
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.document.latest_version.pages.first().ocr_content.content,
|
||||
'Sample text',
|
||||
)
|
||||
|
||||
self.assertEqual(
|
||||
self.document.latest_version.pages.last().ocr_content.content,
|
||||
'Sample text in image form',
|
||||
)
|
||||
@@ -3,7 +3,8 @@ from __future__ import unicode_literals
|
||||
from django.conf.urls import url
|
||||
|
||||
from .api_views import (
|
||||
APIDocumentOCRView, APIDocumentPageContentView, APIDocumentVersionOCRView
|
||||
APIDocumentOCRView, APIDocumentPageOCRContentView,
|
||||
APIDocumentVersionOCRView
|
||||
)
|
||||
from .views import (
|
||||
DocumentAllSubmitView, DocumentOCRContent, DocumentOCRDownloadView,
|
||||
@@ -59,7 +60,8 @@ api_urls = [
|
||||
name='document-version-ocr-submit-view'
|
||||
),
|
||||
url(
|
||||
r'^page/(?P<pk>\d+)/content/$', APIDocumentPageContentView.as_view(),
|
||||
r'^page/(?P<pk>\d+)/content/$',
|
||||
APIDocumentPageOCRContentView.as_view(),
|
||||
name='document-page-content-view'
|
||||
),
|
||||
]
|
||||
|
||||
@@ -3,14 +3,14 @@ from __future__ import unicode_literals
|
||||
from django.utils.encoding import force_text
|
||||
from django.utils.html import conditional_escape
|
||||
|
||||
from .models import DocumentPageContent
|
||||
from .models import DocumentPageOCRContent
|
||||
|
||||
|
||||
def get_document_ocr_content(document):
|
||||
for page in document.pages.all():
|
||||
try:
|
||||
page_content = page.ocr_content.content
|
||||
except DocumentPageContent.DoesNotExist:
|
||||
except DocumentPageOCRContent.DoesNotExist:
|
||||
pass
|
||||
else:
|
||||
yield conditional_escape(force_text(page_content))
|
||||
|
||||
@@ -14,7 +14,7 @@ from common.generics import (
|
||||
from common.mixins import MultipleInstanceActionMixin
|
||||
from documents.models import Document, DocumentType
|
||||
|
||||
from .forms import DocumentContentForm, DocumentTypeSelectForm
|
||||
from .forms import DocumentOCRContentForm, DocumentTypeSelectForm
|
||||
from .models import DocumentVersionOCRError
|
||||
from .permissions import (
|
||||
permission_ocr_content_view, permission_ocr_document,
|
||||
@@ -40,6 +40,27 @@ class DocumentAllSubmitView(ConfirmView):
|
||||
)
|
||||
|
||||
|
||||
class DocumentOCRContent(SingleObjectDetailView):
|
||||
form_class = DocumentOCRContentForm
|
||||
model = Document
|
||||
object_permission = permission_ocr_content_view
|
||||
|
||||
def dispatch(self, request, *args, **kwargs):
|
||||
result = super(DocumentOCRContent, self).dispatch(
|
||||
request, *args, **kwargs
|
||||
)
|
||||
self.get_object().add_as_recent_document_for_user(request.user)
|
||||
return result
|
||||
|
||||
def get_extra_context(self):
|
||||
return {
|
||||
'document': self.get_object(),
|
||||
'hide_labels': True,
|
||||
'object': self.get_object(),
|
||||
'title': _('OCR result for document: %s') % self.get_object(),
|
||||
}
|
||||
|
||||
|
||||
class DocumentSubmitView(ConfirmView):
|
||||
def get_extra_context(self):
|
||||
return {
|
||||
@@ -128,27 +149,6 @@ class DocumentTypeSettingsEditView(SingleObjectEditView):
|
||||
}
|
||||
|
||||
|
||||
class DocumentOCRContent(SingleObjectDetailView):
|
||||
form_class = DocumentContentForm
|
||||
model = Document
|
||||
object_permission = permission_ocr_content_view
|
||||
|
||||
def dispatch(self, request, *args, **kwargs):
|
||||
result = super(DocumentOCRContent, self).dispatch(
|
||||
request, *args, **kwargs
|
||||
)
|
||||
self.get_object().add_as_recent_document_for_user(request.user)
|
||||
return result
|
||||
|
||||
def get_extra_context(self):
|
||||
return {
|
||||
'document': self.get_object(),
|
||||
'hide_labels': True,
|
||||
'object': self.get_object(),
|
||||
'title': _('OCR result for document: %s') % self.get_object(),
|
||||
}
|
||||
|
||||
|
||||
class EntryListView(SingleObjectListView):
|
||||
extra_context = {
|
||||
'hide_object': True,
|
||||
|
||||
Reference in New Issue
Block a user