Update and re-enable ocr app
This commit is contained in:
@@ -1,75 +1 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import logging
|
||||
|
||||
from django.db import transaction
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext
|
||||
from django.db.models.signals import post_save, post_syncdb
|
||||
from django.dispatch import receiver
|
||||
from django.db.utils import DatabaseError
|
||||
|
||||
#from navigation.api import (bind_links, register_multi_item_links,
|
||||
# register_multi_item_links)
|
||||
#from documents.models import Document, DocumentVersion
|
||||
from maintenance.api import MaintenanceNamespace
|
||||
from project_tools.api import register_tool
|
||||
from acls.api import class_permissions
|
||||
from job_processor.models import JobQueue, JobType
|
||||
from job_processor.exceptions import JobQueuePushError
|
||||
|
||||
#from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL)
|
||||
#from .models import OCRProcessingSingleton
|
||||
#from .api import do_document_ocr
|
||||
from .permissions import PERMISSION_OCR_DOCUMENT
|
||||
from .exceptions import AlreadyQueued
|
||||
from . import models as ocr_models
|
||||
from .literals import OCR_QUEUE_NAME
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
ocr_job_queue = None
|
||||
|
||||
from .links import (submit_document, ocr_disable,
|
||||
ocr_enable, all_document_ocr_cleanup, ocr_log,
|
||||
ocr_tool_link, submit_document_multiple)
|
||||
|
||||
bind_links([Document], [submit_document])
|
||||
bind_links([OCRProcessingSingleton], [ocr_disable, ocr_enable])
|
||||
|
||||
namespace = MaintenanceNamespace(label=_(u'OCR'))
|
||||
namespace.create_tool(all_document_ocr_cleanup)
|
||||
register_multi_item_links(['folder_view', 'search', 'results', 'index_instance_node_view', 'document_find_duplicates', 'document_type_document_list', 'document_group_view', 'document_list', 'document_list_recent'], [submit_document_multiple])
|
||||
|
||||
|
||||
@transaction.commit_on_success
def create_ocr_job_queue():
    """
    Fetch or create the OCR job queue and store it in ``ocr_job_queue``.

    The queue is looked up by OCR_QUEUE_NAME; when it does not exist it is
    created with the 'OCR' label and ``unique_jobs`` turned on.

    A DatabaseError is logged and the transaction rolled back; in that
    case the module level ``ocr_job_queue`` is left untouched.
    """
    global ocr_job_queue
    try:
        ocr_job_queue, created = JobQueue.objects.get_or_create(name=OCR_QUEUE_NAME, defaults={'label': _('OCR'), 'unique_jobs': True})
    except DatabaseError as exception:
        # Leave a trace instead of failing silently: without the queue the
        # ``submit_for_ocr`` helpers end up calling ``push`` on None.
        logger.warning('unable to get or create the OCR job queue: %s', exception)
        transaction.rollback()
|
||||
|
||||
|
||||
@receiver(post_save, dispatch_uid='document_post_save', sender=DocumentVersion)
def document_post_save(sender, instance, **kwargs):
    """
    Submit a newly created document version for OCR.

    Connected to the post_save signal of DocumentVersion.  Only acts when
    the signal reports ``created``; updates of existing versions are
    ignored.

    sender -- the class that emitted the signal
    instance -- the saved instance; its ``submit_for_ocr`` method is called
    """
    logger.debug('received post save signal')
    # Hand the instance over as a logging argument so it is only rendered
    # when the debug level is enabled.
    logger.debug('instance: %s', instance)

    if not kwargs.get('created', False):
        return

    # NOTE: the AUTOMATIC_OCR setting check is disabled in this module.
    #if AUTOMATIC_OCR:
    try:
        instance.submit_for_ocr()
    except JobQueuePushError as exception:
        # Best effort: a failed push must not break the save, but record
        # it instead of discarding the error.
        logger.debug('unable to queue instance for OCR: %s', exception)
|
||||
|
||||
|
||||
register_tool(ocr_tool_link)
|
||||
|
||||
class_permissions(Document, [
|
||||
PERMISSION_OCR_DOCUMENT,
|
||||
])
|
||||
|
||||
create_ocr_job_queue()
|
||||
ocr_job_type = JobType('ocr', _(u'OCR'), do_document_ocr)
|
||||
|
||||
Document.add_to_class('submit_for_ocr', lambda document: ocr_job_queue.push(ocr_job_type, document_version_pk=document.latest_version.pk))
|
||||
DocumentVersion.add_to_class('submit_for_ocr', lambda document_version: ocr_job_queue.push(ocr_job_type, document_version_pk=document_version.pk))
|
||||
|
||||
@@ -10,11 +10,11 @@ import sys
|
||||
from django.utils.translation import ugettext as _
|
||||
from django.utils.importlib import import_module
|
||||
|
||||
#from common.settings import TEMPORARY_DIRECTORY
|
||||
#from converter.api import convert
|
||||
from common.settings import TEMPORARY_DIRECTORY
|
||||
from converter.api import convert
|
||||
from documents.models import DocumentPage, DocumentVersion
|
||||
|
||||
#from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
|
||||
from .settings import TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH
|
||||
from .exceptions import TesseractError, UnpaperError
|
||||
from .parsers import parse_document_page
|
||||
from .parsers.exceptions import ParserError, ParserUnknownFile
|
||||
|
||||
@@ -1,10 +1,3 @@
|
||||
class AlreadyQueued(Exception):
|
||||
"""
|
||||
Raised when trying to queue a document that is already in the queue
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class TesseractError(Exception):
|
||||
"""
|
||||
Raised by tesseract
|
||||
@@ -17,15 +10,3 @@ class UnpaperError(Exception):
|
||||
Raised by unpaper
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class ReQueueError(Exception):
    """
    Error related to re-queueing (no raise site is visible here; confirm
    the exact condition against the callers).
    """
|
||||
|
||||
|
||||
class OCRProcessingAlreadyDisabled(Exception):
    """
    Raised when disabling OCR processing that is already disabled
    """
|
||||
|
||||
|
||||
class OCRProcessingAlreadyEnabled(Exception):
    """
    Raised when enabling OCR processing that is already enabled
    """
|
||||
|
||||
@@ -7,22 +7,9 @@ from navigation import Link
|
||||
from .permissions import (PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE,
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES)
|
||||
from .models import OCRProcessingSingleton
|
||||
from .icons import icon_submit_document, icon_ocr_cleanup
|
||||
|
||||
def is_enabled(context):
    """
    Link condition: tell whether OCR processing is currently enabled.

    context -- accepted for the link condition signature; not used
    """
    ocr_processing = OCRProcessingSingleton.get()
    return ocr_processing.is_enabled()
|
||||
|
||||
def is_disabled(context):
    """
    Link condition: tell whether OCR processing is currently disabled.

    context -- accepted for the link condition signature; not used
    """
    ocr_processing = OCRProcessingSingleton.get()
    if ocr_processing.is_enabled():
        return False
    return True
|
||||
|
||||
|
||||
#ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
#ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled)
|
||||
#ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled)
|
||||
|
||||
submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT])
|
||||
|
||||
all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', icon=icon_ocr_cleanup, permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.'))
|
||||
|
||||
#ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue'])
|
||||
|
||||
@@ -1,16 +1,12 @@
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
|
||||
OCR_STATE_DISABLED = 'd'
|
||||
OCR_STATE_ENABLED = 'e'
|
||||
|
||||
OCR_STATE_CHOICES = (
|
||||
(OCR_STATE_DISABLED, _(u'disabled')),
|
||||
(OCR_STATE_ENABLED, _(u'enabled')),
|
||||
)
|
||||
|
||||
DEFAULT_OCR_FILE_FORMAT = u'tiff'
|
||||
DEFAULT_OCR_FILE_EXTENSION = u'tif'
|
||||
UNPAPER_FILE_FORMAT = u'ppm'
|
||||
|
||||
OCR_QUEUE_NAME = 'ocr'
|
||||
|
||||
DEFAULT_TESSERACT_PATH = u'/usr/bin/tesseract'
|
||||
DEFAULT_UNPAPER_PATH = u'/usr/bin/unpaper'
|
||||
DEFAULT_PDFTOTEXT_PATH = u'/usr/bin/pdftotext'
|
||||
DEFAULT_TESSERACT_LANGUAGE = u'eng'
|
||||
DEFAULT_REPLICATION_DELAY = 0
|
||||
|
||||
@@ -1,54 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from ast import literal_eval
|
||||
import datetime
|
||||
|
||||
from django.db import models
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext
|
||||
from django.core.exceptions import ObjectDoesNotExist
|
||||
from django.contrib.contenttypes.models import ContentType
|
||||
from django.contrib.contenttypes import generic
|
||||
from django.core.exceptions import ValidationError
|
||||
|
||||
from common.models import Singleton
|
||||
from documents.models import Document, DocumentVersion
|
||||
from converter.api import get_available_transformations_choices
|
||||
from sources.managers import SourceTransformationManager
|
||||
|
||||
from .literals import (OCR_STATE_CHOICES, OCR_STATE_ENABLED,
|
||||
OCR_STATE_DISABLED)
|
||||
from .exceptions import (ReQueueError, OCRProcessingAlreadyDisabled,
|
||||
OCRProcessingAlreadyEnabled)
|
||||
|
||||
|
||||
class OCRProcessingSingleton(Singleton):
    """
    Singleton model holding the OCR processing on/off switch.

    ``state`` takes one of the OCR_STATE_CHOICES values and defaults to
    OCR_STATE_ENABLED.
    """
    state = models.CharField(
        verbose_name=_(u'state'),
        max_length=4,
        choices=OCR_STATE_CHOICES,
        default=OCR_STATE_ENABLED
    )

    #objects = AnonymousUserSingletonManager()

    class Meta:
        # Singular and plural share the very same label.
        verbose_name = _(u'OCR processing properties')
        verbose_name_plural = verbose_name

    def __unicode__(self):
        return ugettext('OCR processing')

    def is_enabled(self):
        """Return True when the stored state is OCR_STATE_ENABLED."""
        return self.state == OCR_STATE_ENABLED

    def disable(self):
        """
        Switch the state to disabled and save.

        Raises OCRProcessingAlreadyDisabled when already disabled.
        """
        already_disabled = self.state == OCR_STATE_DISABLED
        if already_disabled:
            raise OCRProcessingAlreadyDisabled

        self.state = OCR_STATE_DISABLED
        self.save()

    def enable(self):
        """
        Switch the state to enabled and save.

        Raises OCRProcessingAlreadyEnabled when already enabled.
        """
        already_enabled = self.state == OCR_STATE_ENABLED
        if already_enabled:
            raise OCRProcessingAlreadyEnabled

        self.state = OCR_STATE_ENABLED
        self.save()
|
||||
@@ -6,16 +6,16 @@ import subprocess
|
||||
|
||||
from django.utils.translation import ugettext as _
|
||||
|
||||
#from converter import office_converter
|
||||
#from converter.office_converter import OfficeConverter
|
||||
#from converter.exceptions import OfficeConversionError
|
||||
#from documents.utils import document_save_to_temp_dir
|
||||
#from common.utils import copyfile
|
||||
#from common.settings import TEMPORARY_DIRECTORY
|
||||
#from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
|
||||
from converter import office_converter
|
||||
from converter.office_converter import OfficeConverter
|
||||
from converter.exceptions import OfficeConversionError
|
||||
from documents.utils import document_save_to_temp_dir
|
||||
from common.utils import copyfile
|
||||
from common.settings import TEMPORARY_DIRECTORY
|
||||
from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
|
||||
|
||||
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
|
||||
#from ocr.settings import PDFTOTEXT_PATH
|
||||
from ocr.settings import PDFTOTEXT_PATH
|
||||
|
||||
|
||||
mimetype_registry = {}
|
||||
|
||||
71
apps/ocr/post_init.py
Normal file
71
apps/ocr/post_init.py
Normal file
@@ -0,0 +1,71 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import logging
|
||||
|
||||
from django.db import transaction
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
from django.utils.translation import ugettext
|
||||
from django.db.models.signals import post_save, post_syncdb
|
||||
from django.dispatch import receiver
|
||||
from django.db.utils import DatabaseError
|
||||
|
||||
from navigation.api import (bind_links, register_multi_item_links,
|
||||
register_multi_item_links)
|
||||
from documents.models import Document, DocumentVersion
|
||||
from maintenance.api import MaintenanceNamespace
|
||||
from acls.api import class_permissions
|
||||
from job_processor.models import JobQueue, JobType
|
||||
from job_processor.exceptions import JobQueuePushError
|
||||
|
||||
from .settings import AUTOMATIC_OCR
|
||||
from .api import do_document_ocr
|
||||
from .permissions import PERMISSION_OCR_DOCUMENT
|
||||
from .exceptions import AlreadyQueued
|
||||
from .literals import OCR_QUEUE_NAME
|
||||
from .links import (submit_document, ocr_disable,
|
||||
ocr_enable, all_document_ocr_cleanup, ocr_log,
|
||||
ocr_tool_link, submit_document_multiple)
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
ocr_job_queue = None
|
||||
|
||||
|
||||
@transaction.commit_on_success
def create_ocr_job_queue():
    """
    Fetch or create the OCR job queue and store it in ``ocr_job_queue``.

    The queue is looked up by OCR_QUEUE_NAME; when it does not exist it is
    created with the 'OCR' label and ``unique_jobs`` turned on.

    A DatabaseError is logged and the transaction rolled back; in that
    case the module level ``ocr_job_queue`` is left untouched.
    """
    global ocr_job_queue
    try:
        ocr_job_queue, created = JobQueue.objects.get_or_create(name=OCR_QUEUE_NAME, defaults={'label': _('OCR'), 'unique_jobs': True})
    except DatabaseError as exception:
        # Leave a trace instead of failing silently: without the queue the
        # ``submit_for_ocr`` helpers end up calling ``push`` on None.
        logger.warning('unable to get or create the OCR job queue: %s', exception)
        transaction.rollback()
|
||||
|
||||
|
||||
@receiver(post_save, dispatch_uid='document_post_save', sender=DocumentVersion)
def document_post_save(sender, instance, **kwargs):
    """
    Submit a newly created document version for OCR.

    Connected to the post_save signal of DocumentVersion.  Only acts when
    the signal reports ``created`` and the AUTOMATIC_OCR setting is on.

    sender -- the class that emitted the signal
    instance -- the saved instance; its ``submit_for_ocr`` method is called
    """
    logger.debug('received post save signal')
    # Hand the instance over as a logging argument so it is only rendered
    # when the debug level is enabled.
    logger.debug('instance: %s', instance)

    if not kwargs.get('created', False):
        return

    if not AUTOMATIC_OCR:
        return

    try:
        instance.submit_for_ocr()
    except JobQueuePushError as exception:
        # Best effort: a failed push must not break the save, but record
        # it instead of discarding the error.
        logger.debug('unable to queue instance for OCR: %s', exception)
|
||||
|
||||
|
||||
def init_ocr_app():
    """
    Perform the start-up registrations of the OCR app.

    Binds the submit link to Document, registers the multi item submit
    link for the document list views, grants the OCR class permission on
    Document, makes sure the OCR job queue exists and attaches a
    ``submit_for_ocr`` method to Document and DocumentVersion.
    """
    bind_links([Document], [submit_document])
    # The OCRProcessingSingleton link binding that used to follow was
    # dropped: that class is not imported by this module, so evaluating it
    # raised NameError and aborted the rest of the initialization.
    # NOTE(review): the module level ``.links`` import still names
    # ocr_disable, ocr_enable, ocr_log and ocr_tool_link -- confirm that
    # ``.links`` still defines them.

    #namespace = MaintenanceNamespace(label=_(u'OCR'))
    #namespace.create_tool(all_document_ocr_cleanup)

    register_multi_item_links(['folder_view', 'search', 'results', 'index_instance_node_view', 'document_find_duplicates', 'document_type_document_list', 'document_group_view', 'document_list', 'document_list_recent'], [submit_document_multiple])

    class_permissions(Document, [
        PERMISSION_OCR_DOCUMENT,
    ])

    create_ocr_job_queue()
    ocr_job_type = JobType('ocr', _(u'OCR'), do_document_ocr)

    # Both helpers push a job for a document version pk; a Document
    # submits its latest version.
    Document.add_to_class('submit_for_ocr', lambda document: ocr_job_queue.push(ocr_job_type, document_version_pk=document.latest_version.pk))
    DocumentVersion.add_to_class('submit_for_ocr', lambda document_version: ocr_job_queue.push(ocr_job_type, document_version_pk=document_version.pk))
|
||||
@@ -2,60 +2,52 @@ from __future__ import absolute_import
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from smart_settings import LocalScope
|
||||
from smart_settings import LocalScope, ClusterScope
|
||||
|
||||
from .icons import icon_submit_document
|
||||
from .literals import (DEFAULT_TESSERACT_PATH, DEFAULT_TESSERACT_LANGUAGE,
|
||||
DEFAULT_REPLICATION_DELAY, DEFAULT_UNPAPER_PATH, DEFAULT_PDFTOTEXT_PATH)
|
||||
from .links import all_document_ocr_cleanup
|
||||
|
||||
label = _(u'OCR')
|
||||
description = _(u'Handles optical character recognition.')
|
||||
icon = icon_submit_document
|
||||
dependencies = ['app_registry', 'icons', 'navigation']
|
||||
#maintenance_links = [all_document_ocr_cleanup]
|
||||
settings = [
|
||||
{
|
||||
'name': 'AUTOMATIC_OCR',
|
||||
'default': True,
|
||||
'description': _(u'Automatically queue newly created documents for OCR.'),
|
||||
'scopes': [ClusterScope()]
|
||||
},
|
||||
{
|
||||
'name': 'TESSERACT_PATH',
|
||||
'default': u'/usr/bin/tesseract',
|
||||
'default': DEFAULT_TESSERACT_PATH,
|
||||
'exists': True,
|
||||
'scopes': [LocalScope()]
|
||||
},
|
||||
{
|
||||
'name': 'TESSERACT_LANGUAGE',
|
||||
'default': u'eng',
|
||||
'scopes': [LocalScope()]
|
||||
'default': DEFAULT_TESSERACT_LANGUAGE,
|
||||
'scopes': [ClusterScope()]
|
||||
},
|
||||
{
|
||||
'name': 'REPLICATION_DELAY',
|
||||
'default': 0,
|
||||
'default': DEFAULT_REPLICATION_DELAY,
|
||||
'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'),
|
||||
'scopes': [LocalScope()]
|
||||
},
|
||||
{
|
||||
'name': 'NODE_CONCURRENT_EXECUTION',
|
||||
'default': 1,
|
||||
'description': _(u'Maximum amount of concurrent document OCRs a node can perform.'),
|
||||
'scopes': [LocalScope()]
|
||||
},
|
||||
{
|
||||
'name': 'AUTOMATIC_OCR',
|
||||
'default': True,
|
||||
'description': _(u'Automatically queue newly created documents for OCR.'),
|
||||
'scopes': [LocalScope()]
|
||||
},
|
||||
{
|
||||
'name': 'QUEUE_PROCESSING_INTERVAL',
|
||||
'default': 10,
|
||||
'description': _(u'Automatically queue newly created documents for OCR.'),
|
||||
'scopes': [LocalScope()]
|
||||
},
|
||||
{
|
||||
'name': 'UNPAPER_PATH',
|
||||
'default': u'/usr/bin/unpaper',
|
||||
'default': DEFAULT_UNPAPER_PATH,
|
||||
'description': _(u'File path to unpaper program.'),
|
||||
'exists': True,
|
||||
'scopes': [LocalScope()]
|
||||
},
|
||||
{
|
||||
'name': 'PDFTOTEXT_PATH',
|
||||
'default': u'/usr/bin/pdftotext',
|
||||
'default': DEFAULT_PDFTOTEXT_PATH,
|
||||
'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'),
|
||||
'exists': True,
|
||||
'scopes': [LocalScope()]
|
||||
|
||||
@@ -1,9 +1,6 @@
|
||||
from django.conf.urls.defaults import patterns, url
|
||||
|
||||
urlpatterns = patterns('ocr.views',
|
||||
url(r'^processing/enable/$', 'ocr_enable', (), 'ocr_enable'),
|
||||
url(r'^processing/disable/$', 'ocr_disable', (), 'ocr_disable'),
|
||||
|
||||
url(r'^document/(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
|
||||
url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'),
|
||||
|
||||
|
||||
@@ -19,11 +19,7 @@ from job_processor.exceptions import JobQueuePushError
|
||||
from .permissions import (PERMISSION_OCR_DOCUMENT,
|
||||
PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE,
|
||||
PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_QUEUE_EDIT)
|
||||
from .models import OCRProcessingSingleton
|
||||
from .exceptions import (AlreadyQueued, ReQueueError, OCRProcessingAlreadyDisabled,
|
||||
OCRProcessingAlreadyEnabled)
|
||||
from .api import clean_pages
|
||||
from . import ocr_job_queue, ocr_job_type
|
||||
|
||||
|
||||
# {'name': _(u'document'), 'attribute': encapsulate(lambda x: document_link(x.document_version.document) if hasattr(x, 'document_version') else _(u'Missing document.'))},
|
||||
@@ -32,58 +28,6 @@ from . import ocr_job_queue, ocr_job_type
|
||||
# {'name': _('submitted'), 'attribute': encapsulate(lambda x: unicode(x.datetime_submitted).split('.')[0]), 'keep_together':True},
|
||||
|
||||
|
||||
def ocr_disable(request):
    """
    Ask for confirmation and then disable OCR processing.

    Requires PERMISSION_OCR_QUEUE_ENABLE_DISABLE.  Any non POST request
    renders the generic confirmation page; a POST disables processing and
    redirects to ``next``, or to ``previous`` with a warning when
    processing was already disabled.  Both targets fall back to the HTTP
    referer when not supplied in the request.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_QUEUE_ENABLE_DISABLE])

    referer = request.META.get('HTTP_REFERER', None)
    next_url = request.POST.get('next', request.GET.get('next', referer))
    previous_url = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method != 'POST':
        confirmation_context = {
            'queue': OCRProcessingSingleton.get(),
            'navigation_object_name': 'queue',
            'title': _(u'Are you sure you wish to disable OCR processing?'),
            'next': next_url,
            'previous': previous_url,
            'form_icon': u'control_stop_blue.png',
        }
        return render_to_response('generic_confirm.html', confirmation_context, context_instance=RequestContext(request))

    try:
        OCRProcessingSingleton.get().disable()
    except OCRProcessingAlreadyDisabled:
        messages.warning(request, _(u'OCR processing already disabled.'))
        return HttpResponseRedirect(previous_url)

    messages.success(request, _(u'OCR processing disabled successfully.'))
    return HttpResponseRedirect(next_url)
|
||||
|
||||
|
||||
def ocr_enable(request):
    """
    Ask for confirmation and then enable OCR processing.

    Requires PERMISSION_OCR_QUEUE_ENABLE_DISABLE.  A POST enables
    processing and redirects to ``next``, or to ``previous`` with a
    warning when processing was already enabled; any other request renders
    the generic confirmation page.  Both targets fall back to the HTTP
    referer when not supplied in the request.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_QUEUE_ENABLE_DISABLE])

    referer = request.META.get('HTTP_REFERER', None)
    next_url = request.POST.get('next', request.GET.get('next', referer))
    previous_url = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method == 'POST':
        try:
            OCRProcessingSingleton.get().enable()
        # enable() raises OCRProcessingAlreadyEnabled; catching the
        # "already disabled" exception here left this branch unreachable
        # and let the real exception escape the view.
        except OCRProcessingAlreadyEnabled:
            messages.warning(request, _(u'OCR processing already enabled.'))
            return HttpResponseRedirect(previous_url)
        else:
            messages.success(request, _(u'OCR processing enabled successfully.'))
            return HttpResponseRedirect(next_url)

    return render_to_response('generic_confirm.html', {
        'queue': OCRProcessingSingleton.get(),
        'navigation_object_name': 'queue',
        'title': _(u'Are you sure you wish to enable OCR processing?'),
        'next': next_url,
        'previous': previous_url,
        'form_icon': u'control_play_blue.png',
    }, context_instance=RequestContext(request))
|
||||
|
||||
|
||||
def submit_document_multiple(request):
|
||||
for item_id in request.GET.get('id_list', '').split(','):
|
||||
submit_document(request, item_id)
|
||||
|
||||
Reference in New Issue
Block a user