# Source: mayan-edms/mayan/apps/ocr/apps.py

from __future__ import unicode_literals
import logging
import sh
from django.db.models.signals import post_save
from django.utils.translation import ugettext_lazy as _
from acls import ModelPermission
from common import (
MayanAppConfig, menu_facet, menu_multi_item, menu_object, menu_secondary,
menu_tools
)
from common.utils import encapsulate
from documents.models import Document, DocumentType, DocumentVersion
from documents.search import document_search
from documents.signals import post_version_upload
from documents.widgets import document_link
from installation import PropertyNamespace
from navigation import SourceColumn
from rest_api.classes import APIEndPoint
from .handlers import initialize_new_ocr_settings, post_version_upload_ocr
from .links import (
link_document_content, link_document_submit,
link_document_submit_multiple, link_document_type_ocr_settings,
link_entry_delete, link_entry_delete_multiple, link_entry_list,
link_entry_re_queue, link_entry_re_queue_multiple
)
from .models import DocumentVersionOCRError
from .permissions import permission_ocr_document, permission_ocr_content_view
from .settings import setting_pdftotext_path, setting_tesseract_path, setting_unpaper_path
from .tasks import task_do_ocr
logger = logging.getLogger(__name__)
def document_ocr_submit(self):
    """Queue the latest version of this document for OCR.

    Attached to ``Document`` as ``submit_for_ocr`` by ``OCRApp.ready()``.
    Dispatches ``task_do_ocr`` asynchronously on the ``'ocr'`` queue with the
    primary key of ``self.latest_version`` as its only argument.
    """
    version_pk = self.latest_version.pk
    task_do_ocr.apply_async(queue='ocr', args=[version_pk])
def document_version_ocr_submit(self):
    """Queue this document version for OCR.

    Attached to ``DocumentVersion`` as ``submit_for_ocr`` by
    ``OCRApp.ready()``. Dispatches ``task_do_ocr`` asynchronously on the
    ``'ocr'`` queue with this object's primary key as its only argument.
    """
    task_arguments = [self.pk]
    task_do_ocr.apply_async(queue='ocr', args=task_arguments)
class OCRApp(MayanAppConfig):
    """App configuration for the OCR app.

    ``ready()`` attaches the ``submit_for_ocr`` helpers to ``Document`` and
    ``DocumentVersion``, registers permissions, list columns, the search
    field and menu links, connects the signal handlers, and reports the
    versions of the external binaries (pdftotext, tesseract, unpaper) in the
    ``'ocr'`` property namespace.
    """
    name = 'ocr'
    verbose_name = _('OCR')

    @staticmethod
    def _report_binary_version(namespace, name, label, path, flag, stream):
        """Add the version banner of an external binary to ``namespace``.

        :param namespace: ``PropertyNamespace`` receiving the property.
        :param name: property name, e.g. ``'tesseract'``.
        :param label: human readable (translated) property label.
        :param path: path or name of the executable to probe.
        :param flag: command line switch that prints the version.
        :param stream: attribute of the finished command holding the banner,
            ``'stdout'`` or ``'stderr'``.

        The property value falls back to "not found" when the executable
        cannot be located and to "error getting version" on any other
        failure. This method never raises because of the probed binary.
        """
        try:
            command = sh.Command(path)
            # The invocation lives inside the ``try`` on purpose: a non-zero
            # exit status or an OS level error while printing the version
            # must not abort application start-up.
            value = getattr(command(flag), stream)
        except sh.CommandNotFound:
            value = _('not found')
        except Exception:
            # Deliberately broad: this is a best-effort diagnostic, but keep
            # a trace so that the failure is not completely silent.
            logger.warning(
                'Unable to get the version of "%s"', path, exc_info=True
            )
            value = _('error getting version')

        namespace.add_property(name, label, value, report=True)

    def ready(self):
        """Wire the OCR app into the rest of the project."""
        super(OCRApp, self).ready()

        APIEndPoint('ocr')

        # Expose ``submit_for_ocr()`` on documents and document versions.
        Document.add_to_class('submit_for_ocr', document_ocr_submit)
        DocumentVersion.add_to_class(
            'submit_for_ocr', document_version_ocr_submit
        )

        ModelPermission.register(
            model=Document, permissions=(
                permission_ocr_document, permission_ocr_content_view
            )
        )

        # Columns for ``DocumentVersionOCRError`` entries.
        SourceColumn(
            source=DocumentVersionOCRError, label=_('Document'),
            attribute=encapsulate(
                lambda entry: document_link(entry.document_version.document)
            )
        )
        SourceColumn(
            source=DocumentVersionOCRError, label=_('Added'),
            attribute='datetime_submitted'
        )
        SourceColumn(
            source=DocumentVersionOCRError, label=_('Result'),
            attribute='result'
        )

        document_search.add_model_field(
            field='versions__pages__ocr_content__content',
            label=_('Content')
        )

        menu_facet.bind_links(
            links=[link_document_content], sources=[Document]
        )
        menu_multi_item.bind_links(
            links=[link_document_submit_multiple], sources=[Document]
        )
        menu_multi_item.bind_links(
            links=[link_entry_re_queue_multiple, link_entry_delete_multiple],
            sources=[DocumentVersionOCRError]
        )
        menu_object.bind_links(
            links=[link_document_submit], sources=[Document]
        )
        menu_object.bind_links(
            links=[link_entry_re_queue, link_entry_delete],
            sources=[DocumentVersionOCRError]
        )
        menu_object.bind_links(
            links=[link_document_type_ocr_settings], sources=[DocumentType]
        )
        menu_secondary.bind_links(
            links=[link_entry_list],
            sources=[
                'ocr:entry_list', 'ocr:entry_delete_multiple',
                'ocr:entry_re_queue_multiple', DocumentVersionOCRError
            ]
        )
        menu_tools.bind_links(links=[link_entry_list])

        # ``dispatch_uid`` keeps the receivers from being connected twice.
        post_save.connect(
            initialize_new_ocr_settings,
            dispatch_uid='initialize_new_ocr_settings',
            sender=DocumentType
        )
        post_version_upload.connect(
            post_version_upload_ocr,
            dispatch_uid='post_version_upload_ocr',
            sender=DocumentVersion
        )

        namespace = PropertyNamespace('ocr', _('OCR'))

        # pdftotext and tesseract are read from stderr, unpaper from stdout.
        self._report_binary_version(
            namespace, 'pdftotext', _('pdftotext version'),
            setting_pdftotext_path.value, '-v', 'stderr'
        )
        self._report_binary_version(
            namespace, 'tesseract', _('tesseract version'),
            setting_tesseract_path.value, '-v', 'stderr'
        )
        self._report_binary_version(
            namespace, 'unpaper', _('unpaper version'),
            setting_unpaper_path.value, '-V', 'stdout'
        )