Update and re-enable ocr app

This commit is contained in:
Roberto Rosario
2012-09-16 03:30:32 -04:00
parent a4bbc65508
commit 6f585a2836
14 changed files with 109 additions and 268 deletions

View File

@@ -1,75 +1 @@
from __future__ import absolute_import
import logging
from django.db import transaction
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext
from django.db.models.signals import post_save, post_syncdb
from django.dispatch import receiver
from django.db.utils import DatabaseError
#from navigation.api import (bind_links, register_multi_item_links,
# register_multi_item_links)
#from documents.models import Document, DocumentVersion
from maintenance.api import MaintenanceNamespace
from project_tools.api import register_tool
from acls.api import class_permissions
from job_processor.models import JobQueue, JobType
from job_processor.exceptions import JobQueuePushError
#from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL)
#from .models import OCRProcessingSingleton
#from .api import do_document_ocr
from .permissions import PERMISSION_OCR_DOCUMENT
from .exceptions import AlreadyQueued
from . import models as ocr_models
from .literals import OCR_QUEUE_NAME
logger = logging.getLogger(__name__)
ocr_job_queue = None
from .links import (submit_document, ocr_disable,
ocr_enable, all_document_ocr_cleanup, ocr_log,
ocr_tool_link, submit_document_multiple)
# Module-level registration of the OCR links.
# NOTE(review): bind_links, register_multi_item_links, Document and
# OCRProcessingSingleton are only imported on commented-out lines earlier in
# this module, so these calls raise NameError as written -- confirm intent.
bind_links([Document], [submit_document])
bind_links([OCRProcessingSingleton], [ocr_disable, ocr_enable])
# Create an 'OCR' maintenance namespace and add the page clean-up link to it.
namespace = MaintenanceNamespace(label=_(u'OCR'))
namespace.create_tool(all_document_ocr_cleanup)
# Register the multi-document submit link for the listed names (presumably
# view names -- verify against the navigation app).
register_multi_item_links(['folder_view', 'search', 'results', 'index_instance_node_view', 'document_find_duplicates', 'document_type_document_list', 'document_group_view', 'document_list', 'document_list_recent'], [submit_document_multiple])
@transaction.commit_on_success
def create_ocr_job_queue():
    """
    Fetch or create the OCR job queue and store it in ``ocr_job_queue``.

    The queue is looked up by ``OCR_QUEUE_NAME``; when it does not exist it
    is created with the label 'OCR' and ``unique_jobs`` set to True.

    A ``DatabaseError`` is not propagated: the transaction is rolled back,
    the failure is logged and the module-level ``ocr_job_queue`` keeps its
    previous value.
    """
    global ocr_job_queue
    try:
        # Only the queue instance is needed; the "created" flag is dropped.
        ocr_job_queue = JobQueue.objects.get_or_create(
            name=OCR_QUEUE_NAME,
            defaults={'label': _('OCR'), 'unique_jobs': True}
        )[0]
    except DatabaseError:
        transaction.rollback()
        # Leave a trace instead of discarding the error silently.
        logger.warning('Unable to get or create the "%s" job queue', OCR_QUEUE_NAME, exc_info=True)
@receiver(post_save, dispatch_uid='document_post_save', sender=DocumentVersion)
def document_post_save(sender, instance, **kwargs):
    """
    post_save receiver for DocumentVersion: submit new versions for OCR.

    Only acts when the signal reports ``created``; saves of existing
    instances are ignored.  A ``JobQueuePushError`` raised while submitting
    is logged and not propagated, so saving never fails because of OCR.
    """
    logger.debug('received post save signal')
    # Lazy %-args: the instance is only formatted when debug logging is on,
    # and a tuple-like instance cannot break the format string.
    logger.debug('instance: %s', instance)
    if not kwargs.get('created', False):
        return
    #if AUTOMATIC_OCR:
    try:
        instance.submit_for_ocr()
    except JobQueuePushError as exception:
        # Best effort: report the failed submission and carry on.
        logger.warning('unable to submit %s for OCR: %s', instance, exception)
# Module-level registration of the OCR tool link, permission and job type.
# NOTE(review): Document, DocumentVersion and do_document_ocr are only
# imported on commented-out lines earlier in this module -- confirm intent.
register_tool(ocr_tool_link)
# Register PERMISSION_OCR_DOCUMENT as a class permission of Document.
class_permissions(Document, [
    PERMISSION_OCR_DOCUMENT,
])
# Sets the module-level ocr_job_queue that the lambdas below read when called.
create_ocr_job_queue()
# Job type named 'ocr'; its third argument is the do_document_ocr callable.
ocr_job_type = JobType('ocr', _(u'OCR'), do_document_ocr)
# Add a submit_for_ocr() method to Document that pushes an OCR job for the
# pk of the document's latest version.
Document.add_to_class('submit_for_ocr', lambda document: ocr_job_queue.push(ocr_job_type, document_version_pk=document.latest_version.pk))
# Same method on DocumentVersion, pushing the version's own pk.
DocumentVersion.add_to_class('submit_for_ocr', lambda document_version: ocr_job_queue.push(ocr_job_type, document_version_pk=document_version.pk))

View File

@@ -10,11 +10,11 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
#from common.settings import TEMPORARY_DIRECTORY
#from converter.api import convert
from common.settings import TEMPORARY_DIRECTORY
from converter.api import convert
from documents.models import DocumentPage, DocumentVersion
#from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH)
from .settings import TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH
from .exceptions import TesseractError, UnpaperError
from .parsers import parse_document_page
from .parsers.exceptions import ParserError, ParserUnknownFile

View File

@@ -1,10 +1,3 @@
class AlreadyQueued(Exception):
    """
    Raised when trying to queue a document that is already in the queue
    """
    pass
class TesseractError(Exception):
"""
Raised by tesseract
@@ -17,15 +10,3 @@ class UnpaperError(Exception):
Raised by unpaper
"""
pass
class ReQueueError(Exception):
    """
    Presumably raised when an item cannot be re-queued -- TODO confirm
    against the code that raises it.
    """


class OCRProcessingAlreadyDisabled(Exception):
    """
    Raised when OCR processing is asked to disable while already disabled
    """


class OCRProcessingAlreadyEnabled(Exception):
    """
    Raised when OCR processing is asked to enable while already enabled
    """

View File

@@ -7,22 +7,9 @@ from navigation import Link
from .permissions import (PERMISSION_OCR_DOCUMENT,
PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE,
PERMISSION_OCR_CLEAN_ALL_PAGES)
from .models import OCRProcessingSingleton
from .icons import icon_submit_document, icon_ocr_cleanup
def is_enabled(context):
    """
    Return the result of ``is_enabled()`` on the OCR processing singleton.

    ``context`` is accepted but not used.
    """
    ocr_processing = OCRProcessingSingleton.get()
    return ocr_processing.is_enabled()
def is_disabled(context):
    """
    Return the negation of ``is_enabled()`` on the OCR processing singleton.

    ``context`` is accepted but not used.
    """
    ocr_processing = OCRProcessingSingleton.get()
    enabled = ocr_processing.is_enabled()
    return not enabled
#ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT])
#ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled)
#ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled)
# Submit one document; the view receives 'object.id' as its argument.
submit_document = Link(
    text=_('submit to OCR queue'),
    view='submit_document',
    args='object.id',
    icon=icon_submit_document,
    permissions=[PERMISSION_OCR_DOCUMENT],
)

# Submit several documents at once; same text, icon and permission as above.
submit_document_multiple = Link(
    text=_('submit to OCR queue'),
    view='submit_document_multiple',
    icon=icon_submit_document,
    permissions=[PERMISSION_OCR_DOCUMENT],
)

# Clean-up link for page content, guarded by its own permission.
all_document_ocr_cleanup = Link(
    text=_(u'clean up pages content'),
    view='all_document_ocr_cleanup',
    icon=icon_ocr_cleanup,
    permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES],
    description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.'),
)
#ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue'])

View File

@@ -1,16 +1,12 @@
from django.utils.translation import ugettext_lazy as _
OCR_STATE_DISABLED = 'd'
OCR_STATE_ENABLED = 'e'
OCR_STATE_CHOICES = (
(OCR_STATE_DISABLED, _(u'disabled')),
(OCR_STATE_ENABLED, _(u'enabled')),
)
DEFAULT_OCR_FILE_FORMAT = u'tiff'
DEFAULT_OCR_FILE_EXTENSION = u'tif'
UNPAPER_FILE_FORMAT = u'ppm'
OCR_QUEUE_NAME = 'ocr'
DEFAULT_TESSERACT_PATH = u'/usr/bin/tesseract'
DEFAULT_UNPAPER_PATH = u'/usr/bin/unpaper'
DEFAULT_PDFTOTEXT_PATH = u'/usr/bin/pdftotext'
DEFAULT_TESSERACT_LANGUAGE = u'eng'
DEFAULT_REPLICATION_DELAY = 0

View File

@@ -1,54 +0,0 @@
from __future__ import absolute_import
from ast import literal_eval
import datetime
from django.db import models
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext
from django.core.exceptions import ObjectDoesNotExist
from django.contrib.contenttypes.models import ContentType
from django.contrib.contenttypes import generic
from django.core.exceptions import ValidationError
from common.models import Singleton
from documents.models import Document, DocumentVersion
from converter.api import get_available_transformations_choices
from sources.managers import SourceTransformationManager
from .literals import (OCR_STATE_CHOICES, OCR_STATE_ENABLED,
OCR_STATE_DISABLED)
from .exceptions import (ReQueueError, OCRProcessingAlreadyDisabled,
OCRProcessingAlreadyEnabled)
class OCRProcessingSingleton(Singleton):
    """
    Singleton model that stores whether OCR processing is enabled or disabled.
    """
    # One of OCR_STATE_CHOICES; new rows start out enabled.
    state = models.CharField(
        verbose_name=_(u'state'),
        max_length=4,
        choices=OCR_STATE_CHOICES,
        default=OCR_STATE_ENABLED
    )

    #objects = AnonymousUserSingletonManager()

    class Meta:
        verbose_name = _(u'OCR processing properties')
        verbose_name_plural = verbose_name

    def __unicode__(self):
        return ugettext('OCR processing')

    def _switch_state(self, target_state, already_there_error):
        # Shared body of enable()/disable(): refuse a no-op transition,
        # otherwise store the new state and save the instance.
        if self.state == target_state:
            raise already_there_error
        self.state = target_state
        self.save()

    def disable(self):
        """
        Set the state to disabled and save.

        Raises OCRProcessingAlreadyDisabled if the state is already disabled.
        """
        self._switch_state(OCR_STATE_DISABLED, OCRProcessingAlreadyDisabled)

    def enable(self):
        """
        Set the state to enabled and save.

        Raises OCRProcessingAlreadyEnabled if the state is already enabled.
        """
        self._switch_state(OCR_STATE_ENABLED, OCRProcessingAlreadyEnabled)

    def is_enabled(self):
        """
        Return True when the stored state equals OCR_STATE_ENABLED.
        """
        return OCR_STATE_ENABLED == self.state

View File

@@ -6,16 +6,16 @@ import subprocess
from django.utils.translation import ugettext as _
#from converter import office_converter
#from converter.office_converter import OfficeConverter
#from converter.exceptions import OfficeConversionError
#from documents.utils import document_save_to_temp_dir
#from common.utils import copyfile
#from common.settings import TEMPORARY_DIRECTORY
#from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
from converter import office_converter
from converter.office_converter import OfficeConverter
from converter.exceptions import OfficeConversionError
from documents.utils import document_save_to_temp_dir
from common.utils import copyfile
from common.settings import TEMPORARY_DIRECTORY
from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
#from ocr.settings import PDFTOTEXT_PATH
from ocr.settings import PDFTOTEXT_PATH
mimetype_registry = {}

71
apps/ocr/post_init.py Normal file
View File

@@ -0,0 +1,71 @@
from __future__ import absolute_import
import logging
from django.db import transaction
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext
from django.db.models.signals import post_save, post_syncdb
from django.dispatch import receiver
from django.db.utils import DatabaseError
from navigation.api import (bind_links, register_multi_item_links,
register_multi_item_links)
from documents.models import Document, DocumentVersion
from maintenance.api import MaintenanceNamespace
from acls.api import class_permissions
from job_processor.models import JobQueue, JobType
from job_processor.exceptions import JobQueuePushError
from .settings import AUTOMATIC_OCR
from .api import do_document_ocr
from .permissions import PERMISSION_OCR_DOCUMENT
from .exceptions import AlreadyQueued
from .literals import OCR_QUEUE_NAME
from .links import (submit_document, ocr_disable,
ocr_enable, all_document_ocr_cleanup, ocr_log,
ocr_tool_link, submit_document_multiple)
logger = logging.getLogger(__name__)
ocr_job_queue = None
@transaction.commit_on_success
def create_ocr_job_queue():
    """
    Fetch or create the OCR job queue and store it in ``ocr_job_queue``.

    The queue is looked up by ``OCR_QUEUE_NAME``; when it does not exist it
    is created with the label 'OCR' and ``unique_jobs`` set to True.

    A ``DatabaseError`` is not propagated: the transaction is rolled back,
    the failure is logged and the module-level ``ocr_job_queue`` keeps its
    previous value (initially None).
    """
    global ocr_job_queue
    try:
        # Only the queue instance is needed; the "created" flag is dropped.
        ocr_job_queue = JobQueue.objects.get_or_create(
            name=OCR_QUEUE_NAME,
            defaults={'label': _('OCR'), 'unique_jobs': True}
        )[0]
    except DatabaseError:
        transaction.rollback()
        # Leave a trace instead of discarding the error silently.
        logger.warning('Unable to get or create the "%s" job queue', OCR_QUEUE_NAME, exc_info=True)
@receiver(post_save, dispatch_uid='document_post_save', sender=DocumentVersion)
def document_post_save(sender, instance, **kwargs):
    """
    post_save receiver for DocumentVersion: submit new versions for OCR.

    Only acts when the signal reports ``created`` and the AUTOMATIC_OCR
    setting is true.  A ``JobQueuePushError`` raised while submitting is
    logged and not propagated, so saving never fails because of OCR.
    """
    logger.debug('received post save signal')
    # Lazy %-args: the instance is only formatted when debug logging is on,
    # and a tuple-like instance cannot break the format string.
    logger.debug('instance: %s', instance)
    if not (kwargs.get('created', False) and AUTOMATIC_OCR):
        return
    try:
        instance.submit_for_ocr()
    except JobQueuePushError as exception:
        # Best effort: report the failed submission and carry on.
        logger.warning('unable to submit %s for OCR: %s', instance, exception)
def init_ocr_app():
    """
    Register the OCR app's links and its Document class permission.

    Binds the single-document submit link to Document, registers the
    multi-document submit link for the listed names and registers
    PERMISSION_OCR_DOCUMENT as a class permission of Document.
    """
    bind_links([Document], [submit_document])
    # The enable/disable links were previously bound to
    # OCRProcessingSingleton here.  That name is never imported into this
    # module, so the call could only raise NameError; the binding is dropped.
    #namespace = MaintenanceNamespace(label=_(u'OCR'))
    #namespace.create_tool(all_document_ocr_cleanup)
    register_multi_item_links(
        [
            'folder_view', 'search', 'results', 'index_instance_node_view',
            'document_find_duplicates', 'document_type_document_list',
            'document_group_view', 'document_list', 'document_list_recent',
        ],
        [submit_document_multiple]
    )
    class_permissions(Document, [
        PERMISSION_OCR_DOCUMENT,
    ])
# Sets the module-level ocr_job_queue that the lambdas below read when called.
# NOTE(review): if create_ocr_job_queue() hit a DatabaseError the queue is
# still None and submit_for_ocr() would fail on ocr_job_queue.push -- confirm
# this is acceptable.
create_ocr_job_queue()
# Job type named 'ocr'; its third argument is the do_document_ocr callable.
ocr_job_type = JobType('ocr', _(u'OCR'), do_document_ocr)
# Add a submit_for_ocr() method to Document that pushes an OCR job for the
# pk of the document's latest version.
Document.add_to_class('submit_for_ocr', lambda document: ocr_job_queue.push(ocr_job_type, document_version_pk=document.latest_version.pk))
# Same method on DocumentVersion, pushing the version's own pk.
DocumentVersion.add_to_class('submit_for_ocr', lambda document_version: ocr_job_queue.push(ocr_job_type, document_version_pk=document_version.pk))

View File

@@ -2,60 +2,52 @@ from __future__ import absolute_import
from django.utils.translation import ugettext_lazy as _
from smart_settings import LocalScope
from smart_settings import LocalScope, ClusterScope
from .icons import icon_submit_document
from .literals import (DEFAULT_TESSERACT_PATH, DEFAULT_TESSERACT_LANGUAGE,
DEFAULT_REPLICATION_DELAY, DEFAULT_UNPAPER_PATH, DEFAULT_PDFTOTEXT_PATH)
from .links import all_document_ocr_cleanup
label = _(u'OCR')
description = _(u'Handles optical character recognition.')
icon = icon_submit_document
dependencies = ['app_registry', 'icons', 'navigation']
#maintenance_links = [all_document_ocr_cleanup]
settings = [
{
'name': 'AUTOMATIC_OCR',
'default': True,
'description': _(u'Automatically queue newly created documents for OCR.'),
'scopes': [ClusterScope()]
},
{
'name': 'TESSERACT_PATH',
'default': u'/usr/bin/tesseract',
'default': DEFAULT_TESSERACT_PATH,
'exists': True,
'scopes': [LocalScope()]
},
{
'name': 'TESSERACT_LANGUAGE',
'default': u'eng',
'scopes': [LocalScope()]
'default': DEFAULT_TESSERACT_LANGUAGE,
'scopes': [ClusterScope()]
},
{
'name': 'REPLICATION_DELAY',
'default': 0,
'default': DEFAULT_REPLICATION_DELAY,
'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'),
'scopes': [LocalScope()]
},
{
'name': 'NODE_CONCURRENT_EXECUTION',
'default': 1,
'description': _(u'Maximum amount of concurrent document OCRs a node can perform.'),
'scopes': [LocalScope()]
},
{
'name': 'AUTOMATIC_OCR',
'default': True,
'description': _(u'Automatically queue newly created documents for OCR.'),
'scopes': [LocalScope()]
},
{
'name': 'QUEUE_PROCESSING_INTERVAL',
'default': 10,
'description': _(u'Automatically queue newly created documents for OCR.'),
'scopes': [LocalScope()]
},
{
'name': 'UNPAPER_PATH',
'default': u'/usr/bin/unpaper',
'default': DEFAULT_UNPAPER_PATH,
'description': _(u'File path to unpaper program.'),
'exists': True,
'scopes': [LocalScope()]
},
{
'name': 'PDFTOTEXT_PATH',
'default': u'/usr/bin/pdftotext',
'default': DEFAULT_PDFTOTEXT_PATH,
'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'),
'exists': True,
'scopes': [LocalScope()]

View File

@@ -1,9 +1,6 @@
from django.conf.urls.defaults import patterns, url
urlpatterns = patterns('ocr.views',
url(r'^processing/enable/$', 'ocr_enable', (), 'ocr_enable'),
url(r'^processing/disable/$', 'ocr_disable', (), 'ocr_disable'),
url(r'^document/(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'),

View File

@@ -19,11 +19,7 @@ from job_processor.exceptions import JobQueuePushError
from .permissions import (PERMISSION_OCR_DOCUMENT,
PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE,
PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_QUEUE_EDIT)
from .models import OCRProcessingSingleton
from .exceptions import (AlreadyQueued, ReQueueError, OCRProcessingAlreadyDisabled,
OCRProcessingAlreadyEnabled)
from .api import clean_pages
from . import ocr_job_queue, ocr_job_type
# {'name': _(u'document'), 'attribute': encapsulate(lambda x: document_link(x.document_version.document) if hasattr(x, 'document_version') else _(u'Missing document.'))},
@@ -32,58 +28,6 @@ from . import ocr_job_queue, ocr_job_type
# {'name': _('submitted'), 'attribute': encapsulate(lambda x: unicode(x.datetime_submitted).split('.')[0]), 'keep_together':True},
def ocr_disable(request):
    """
    Confirm and perform the disabling of OCR processing.

    Requires PERMISSION_OCR_QUEUE_ENABLE_DISABLE.  On POST the singleton is
    disabled and the user is redirected to ``next``; if processing was
    already disabled a warning is shown and the redirect goes to
    ``previous``.  Any other method renders the generic confirmation page.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_QUEUE_ENABLE_DISABLE])

    # Both targets resolve as POST value, then GET value, then the referer.
    referer = request.META.get('HTTP_REFERER', None)
    next_url = request.POST.get('next', request.GET.get('next', referer))
    previous_url = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method == 'POST':
        try:
            OCRProcessingSingleton.get().disable()
        except OCRProcessingAlreadyDisabled:
            messages.warning(request, _(u'OCR processing already disabled.'))
            return HttpResponseRedirect(previous_url)
        messages.success(request, _(u'OCR processing disabled successfully.'))
        return HttpResponseRedirect(next_url)

    template_context = {
        'queue': OCRProcessingSingleton.get(),
        'navigation_object_name': 'queue',
        'title': _(u'Are you sure you wish to disable OCR processing?'),
        'next': next_url,
        'previous': previous_url,
        'form_icon': u'control_stop_blue.png',
    }
    return render_to_response(
        'generic_confirm.html', template_context,
        context_instance=RequestContext(request)
    )
def ocr_enable(request):
    """
    Confirm and perform the enabling of OCR processing.

    Requires PERMISSION_OCR_QUEUE_ENABLE_DISABLE.  On POST the singleton is
    enabled and the user is redirected to ``next``; if processing was
    already enabled a warning is shown and the redirect goes to
    ``previous``.  Any other method renders the generic confirmation page.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_QUEUE_ENABLE_DISABLE])

    # Both targets resolve as POST value, then GET value, then the referer.
    next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
    previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))

    if request.method == 'POST':
        try:
            OCRProcessingSingleton.get().enable()
        except OCRProcessingAlreadyEnabled:
            # enable() raises OCRProcessingAlreadyEnabled, not ...Disabled;
            # catching the wrong class let the error escape the view.
            messages.warning(request, _(u'OCR processing already enabled.'))
            return HttpResponseRedirect(previous)
        else:
            messages.success(request, _(u'OCR processing enabled successfully.'))
            return HttpResponseRedirect(next)

    return render_to_response('generic_confirm.html', {
        'queue': OCRProcessingSingleton.get(),
        'navigation_object_name': 'queue',
        'title': _(u'Are you sure you wish to enable OCR processing?'),
        'next': next,
        'previous': previous,
        'form_icon': u'control_play_blue.png',
    }, context_instance=RequestContext(request))
def submit_document_multiple(request):
for item_id in request.GET.get('id_list', '').split(','):
submit_document(request, item_id)