From 6f585a28362ba537c54eccee8e4c716ba3fb2899 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Sun, 16 Sep 2012 03:30:32 -0400 Subject: [PATCH] Update and re-enable ocr app --- apps/ocr/__init__.py | 74 ------------------------------------ apps/ocr/api.py | 6 +-- apps/ocr/exceptions.py | 19 --------- apps/ocr/links.py | 15 +------- apps/ocr/literals.py | 16 +++----- apps/ocr/models.py | 54 -------------------------- apps/ocr/parsers/__init__.py | 16 ++++---- apps/ocr/post_init.py | 71 ++++++++++++++++++++++++++++++++++ apps/ocr/registry.py | 42 +++++++++----------- apps/ocr/urls.py | 3 -- apps/ocr/views.py | 56 --------------------------- docs/releases/0.13.rst | 1 + settings.py | 2 +- urls.py | 2 +- 14 files changed, 109 insertions(+), 268 deletions(-) delete mode 100644 apps/ocr/models.py create mode 100644 apps/ocr/post_init.py diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py index d0b006548e..8b13789179 100644 --- a/apps/ocr/__init__.py +++ b/apps/ocr/__init__.py @@ -1,75 +1 @@ -from __future__ import absolute_import -import logging - -from django.db import transaction -from django.utils.translation import ugettext_lazy as _ -from django.utils.translation import ugettext -from django.db.models.signals import post_save, post_syncdb -from django.dispatch import receiver -from django.db.utils import DatabaseError - -#from navigation.api import (bind_links, register_multi_item_links, -# register_multi_item_links) -#from documents.models import Document, DocumentVersion -from maintenance.api import MaintenanceNamespace -from project_tools.api import register_tool -from acls.api import class_permissions -from job_processor.models import JobQueue, JobType -from job_processor.exceptions import JobQueuePushError - -#from .conf.settings import (AUTOMATIC_OCR, QUEUE_PROCESSING_INTERVAL) -#from .models import OCRProcessingSingleton -#from .api import do_document_ocr -from .permissions import PERMISSION_OCR_DOCUMENT -from .exceptions import AlreadyQueued -from . import models as ocr_models -from .literals import OCR_QUEUE_NAME - -logger = logging.getLogger(__name__) -ocr_job_queue = None - -from .links import (submit_document, ocr_disable, - ocr_enable, all_document_ocr_cleanup, ocr_log, - ocr_tool_link, submit_document_multiple) - -bind_links([Document], [submit_document]) -bind_links([OCRProcessingSingleton], [ocr_disable, ocr_enable]) - -namespace = MaintenanceNamespace(label=_(u'OCR')) -namespace.create_tool(all_document_ocr_cleanup) -register_multi_item_links(['folder_view', 'search', 'results', 'index_instance_node_view', 'document_find_duplicates', 'document_type_document_list', 'document_group_view', 'document_list', 'document_list_recent'], [submit_document_multiple]) - - -@transaction.commit_on_success -def create_ocr_job_queue(): - global ocr_job_queue - try: - ocr_job_queue, created = JobQueue.objects.get_or_create(name=OCR_QUEUE_NAME, defaults={'label': _('OCR'), 'unique_jobs': True}) - except DatabaseError: - transaction.rollback() - - -@receiver(post_save, dispatch_uid='document_post_save', sender=DocumentVersion) -def document_post_save(sender, instance, **kwargs): - logger.debug('received post save signal') - logger.debug('instance: %s' % instance) - if kwargs.get('created', False): - #if AUTOMATIC_OCR: - try: - instance.submit_for_ocr() - except JobQueuePushError: - pass - - -register_tool(ocr_tool_link) - -class_permissions(Document, [ - PERMISSION_OCR_DOCUMENT, -]) - -create_ocr_job_queue() -ocr_job_type = JobType('ocr', _(u'OCR'), do_document_ocr) - -Document.add_to_class('submit_for_ocr', lambda document: ocr_job_queue.push(ocr_job_type, document_version_pk=document.latest_version.pk)) -DocumentVersion.add_to_class('submit_for_ocr', lambda document_version: ocr_job_queue.push(ocr_job_type, document_version_pk=document_version.pk)) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 8078ba3843..c6e45cf7cb 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -10,11 +10,11 @@ import sys from django.utils.translation import ugettext as _ from django.utils.importlib import import_module -#from common.settings import TEMPORARY_DIRECTORY -#from converter.api import convert +from common.settings import TEMPORARY_DIRECTORY +from converter.api import convert from documents.models import DocumentPage, DocumentVersion -#from .conf.settings import (TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH) +from .settings import TESSERACT_PATH, TESSERACT_LANGUAGE, UNPAPER_PATH from .exceptions import TesseractError, UnpaperError from .parsers import parse_document_page from .parsers.exceptions import ParserError, ParserUnknownFile diff --git a/apps/ocr/exceptions.py b/apps/ocr/exceptions.py index 27d72374b9..3e30733aed 100644 --- a/apps/ocr/exceptions.py +++ b/apps/ocr/exceptions.py @@ -1,10 +1,3 @@ -class AlreadyQueued(Exception): - """ - Raised when a trying to queue document already in the queue - """ - pass - - class TesseractError(Exception): """ Raised by tesseract @@ -17,15 +10,3 @@ class UnpaperError(Exception): Raised by unpaper """ pass - - -class ReQueueError(Exception): - pass - - -class OCRProcessingAlreadyDisabled(Exception): - pass - - -class OCRProcessingAlreadyEnabled(Exception): - pass diff --git a/apps/ocr/links.py b/apps/ocr/links.py index 5c5bc3a6ce..a3fdaeefd9 100644 --- a/apps/ocr/links.py +++ b/apps/ocr/links.py @@ -7,22 +7,9 @@ from navigation import Link from .permissions import (PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES) -from .models import OCRProcessingSingleton from .icons import icon_submit_document, icon_ocr_cleanup - -def is_enabled(context): - return OCRProcessingSingleton.get().is_enabled() - -def is_disabled(context): - return not OCRProcessingSingleton.get().is_enabled() - - -#ocr_log = Link(text=_(u'queue document list'), view='ocr_log', sprite='text', permissions=[PERMISSION_OCR_DOCUMENT]) -#ocr_disable = Link(text=_(u'disable OCR processing'), view='ocr_disable', sprite='control_stop_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_disabled) -#ocr_enable = Link(text=_(u'enable OCR processing'), view='ocr_enable', sprite='control_play_blue', permissions=[PERMISSION_OCR_QUEUE_ENABLE_DISABLE], conditional_disable=is_enabled) + submit_document = Link(text=_('submit to OCR queue'), view='submit_document', args='object.id', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT]) submit_document_multiple = Link(text=_('submit to OCR queue'), view='submit_document_multiple', icon=icon_submit_document, permissions=[PERMISSION_OCR_DOCUMENT]) all_document_ocr_cleanup = Link(text=_(u'clean up pages content'), view='all_document_ocr_cleanup', icon=icon_ocr_cleanup, permissions=[PERMISSION_OCR_CLEAN_ALL_PAGES], description=_(u'Runs a language filter to remove common OCR mistakes from document pages content.')) - -#ocr_tool_link = Link(text=_(u'OCR'), view='ocr_log', sprite='hourglass', icon='text.png', permissions=[PERMISSION_OCR_DOCUMENT]) # children_view_regex=[r'queue_', r'document_queue']) diff --git a/apps/ocr/literals.py b/apps/ocr/literals.py index b7d10f8615..092aef22e0 100644 --- a/apps/ocr/literals.py +++ b/apps/ocr/literals.py @@ -1,16 +1,12 @@ from django.utils.translation import ugettext_lazy as _ - -OCR_STATE_DISABLED = 'd' -OCR_STATE_ENABLED = 'e' - -OCR_STATE_CHOICES = ( - (OCR_STATE_DISABLED, _(u'disabled')), - (OCR_STATE_ENABLED, _(u'enabled')), -) - DEFAULT_OCR_FILE_FORMAT = u'tiff' DEFAULT_OCR_FILE_EXTENSION = u'tif' UNPAPER_FILE_FORMAT = u'ppm' - OCR_QUEUE_NAME = 'ocr' + +DEFAULT_TESSERACT_PATH = u'/usr/bin/tesseract' +DEFAULT_UNPAPER_PATH = u'/usr/bin/unpaper' +DEFAULT_PDFTOTEXT_PATH = u'/usr/bin/pdftotext' +DEFAULT_TESSERACT_LANGUAGE = u'eng' +DEFAULT_REPLICATION_DELAY = 0 diff --git a/apps/ocr/models.py b/apps/ocr/models.py deleted file mode 100644 index 552e124c50..0000000000 --- a/apps/ocr/models.py +++ /dev/null @@ -1,54 +0,0 @@ -from __future__ import absolute_import - -from ast import literal_eval -import datetime - -from django.db import models -from django.utils.translation import ugettext_lazy as _ -from django.utils.translation import ugettext -from django.core.exceptions import ObjectDoesNotExist -from django.contrib.contenttypes.models import ContentType -from django.contrib.contenttypes import generic -from django.core.exceptions import ValidationError - -from common.models import Singleton -from documents.models import Document, DocumentVersion -from converter.api import get_available_transformations_choices -from sources.managers import SourceTransformationManager - -from .literals import (OCR_STATE_CHOICES, OCR_STATE_ENABLED, - OCR_STATE_DISABLED) -from .exceptions import (ReQueueError, OCRProcessingAlreadyDisabled, - OCRProcessingAlreadyEnabled) - - -class OCRProcessingSingleton(Singleton): - state = models.CharField(max_length=4, - choices=OCR_STATE_CHOICES, - default=OCR_STATE_ENABLED, - verbose_name=_(u'state')) - - #objects = AnonymousUserSingletonManager() - - def __unicode__(self): - return ugettext('OCR processing') - - def disable(self): - if self.state == OCR_STATE_DISABLED: - raise OCRProcessingAlreadyDisabled - - self.state = OCR_STATE_DISABLED - self.save() - - def enable(self): - if self.state == OCR_STATE_ENABLED: - raise OCRProcessingAlreadyEnabled - - self.state = OCR_STATE_ENABLED - self.save() - - def is_enabled(self): - return self.state == OCR_STATE_ENABLED - - class Meta: - verbose_name = verbose_name_plural = _(u'OCR processing properties') diff --git a/apps/ocr/parsers/__init__.py b/apps/ocr/parsers/__init__.py index 897476fc3d..fc7289b247 100644 --- a/apps/ocr/parsers/__init__.py +++ b/apps/ocr/parsers/__init__.py @@ -6,16 +6,16 @@ import subprocess from django.utils.translation import ugettext as _ -#from converter import office_converter -#from converter.office_converter import OfficeConverter -#from converter.exceptions import OfficeConversionError -#from documents.utils import document_save_to_temp_dir -#from common.utils import copyfile -#from common.settings import TEMPORARY_DIRECTORY -#from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES +from converter import office_converter +from converter.office_converter import OfficeConverter +from converter.exceptions import OfficeConversionError +from documents.utils import document_save_to_temp_dir +from common.utils import copyfile +from common.settings import TEMPORARY_DIRECTORY +from common.textparser import TextParser as OriginalTextParser, TEXT_PARSER_MIMETYPES from ocr.parsers.exceptions import ParserError, ParserUnknownFile -#from ocr.settings import PDFTOTEXT_PATH +from ocr.settings import PDFTOTEXT_PATH mimetype_registry = {} diff --git a/apps/ocr/post_init.py b/apps/ocr/post_init.py new file mode 100644 index 0000000000..d59cf9488b --- /dev/null +++ b/apps/ocr/post_init.py @@ -0,0 +1,71 @@ +from __future__ import absolute_import + +import logging + +from django.db import transaction +from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext +from django.db.models.signals import post_save, post_syncdb +from django.dispatch import receiver +from django.db.utils import DatabaseError + +from navigation.api import (bind_links, register_multi_item_links, + register_multi_item_links) +from documents.models import Document, DocumentVersion +from maintenance.api import MaintenanceNamespace +from acls.api import class_permissions +from job_processor.models import JobQueue, JobType +from job_processor.exceptions import JobQueuePushError + +from .settings import AUTOMATIC_OCR +from .api import do_document_ocr +from .permissions import PERMISSION_OCR_DOCUMENT +from .exceptions import AlreadyQueued +from .literals import OCR_QUEUE_NAME +from .links import (submit_document, ocr_disable, + ocr_enable, all_document_ocr_cleanup, ocr_log, + ocr_tool_link, submit_document_multiple) + +logger = logging.getLogger(__name__) +ocr_job_queue = None + + +@transaction.commit_on_success +def create_ocr_job_queue(): + global ocr_job_queue + try: + ocr_job_queue, created = JobQueue.objects.get_or_create(name=OCR_QUEUE_NAME, defaults={'label': _('OCR'), 'unique_jobs': True}) + except DatabaseError: + transaction.rollback() + + +@receiver(post_save, dispatch_uid='document_post_save', sender=DocumentVersion) +def document_post_save(sender, instance, **kwargs): + logger.debug('received post save signal') + logger.debug('instance: %s' % instance) + if kwargs.get('created', False): + if AUTOMATIC_OCR: + try: + instance.submit_for_ocr() + except JobQueuePushError: + pass + + +def init_ocr_app(): + bind_links([Document], [submit_document]) + bind_links([OCRProcessingSingleton], [ocr_disable, ocr_enable]) + + #namespace = MaintenanceNamespace(label=_(u'OCR')) + #namespace.create_tool(all_document_ocr_cleanup) + + register_multi_item_links(['folder_view', 'search', 'results', 'index_instance_node_view', 'document_find_duplicates', 'document_type_document_list', 'document_group_view', 'document_list', 'document_list_recent'], [submit_document_multiple]) + + class_permissions(Document, [ + PERMISSION_OCR_DOCUMENT, + ]) + + create_ocr_job_queue() + ocr_job_type = JobType('ocr', _(u'OCR'), do_document_ocr) + + Document.add_to_class('submit_for_ocr', lambda document: ocr_job_queue.push(ocr_job_type, document_version_pk=document.latest_version.pk)) + DocumentVersion.add_to_class('submit_for_ocr', lambda document_version: ocr_job_queue.push(ocr_job_type, document_version_pk=document_version.pk)) diff --git a/apps/ocr/registry.py b/apps/ocr/registry.py index 7df6b324e8..a8449997cd 100644 --- a/apps/ocr/registry.py +++ b/apps/ocr/registry.py @@ -2,60 +2,52 @@ from __future__ import absolute_import from django.utils.translation import ugettext_lazy as _ -from smart_settings import LocalScope +from smart_settings import LocalScope, ClusterScope from .icons import icon_submit_document +from .literals import (DEFAULT_TESSERACT_PATH, DEFAULT_TESSERACT_LANGUAGE, + DEFAULT_REPLICATION_DELAY, DEFAULT_UNPAPER_PATH, DEFAULT_PDFTOTEXT_PATH) +from .links import all_document_ocr_cleanup label = _(u'OCR') description = _(u'Handles optical character recognition.') icon = icon_submit_document dependencies = ['app_registry', 'icons', 'navigation'] +#maintenance_links = [all_document_ocr_cleanup] settings = [ + { + 'name': 'AUTOMATIC_OCR', + 'default': True, + 'description': _(u'Automatically queue newly created documents for OCR.'), + 'scopes': [ClusterScope()] + }, { 'name': 'TESSERACT_PATH', - 'default': u'/usr/bin/tesseract', + 'default': DEFAULT_TESSERACT_PATH, 'exists': True, 'scopes': [LocalScope()] }, { 'name': 'TESSERACT_LANGUAGE', - 'default': u'eng', - 'scopes': [LocalScope()] + 'default': DEFAULT_TESSERACT_LANGUAGE, + 'scopes': [ClusterScope()] }, { 'name': 'REPLICATION_DELAY', - 'default': 0, + 'default': DEFAULT_REPLICATION_DELAY, 'description': _(u'Amount of seconds to delay OCR of documents to allow for the node\'s storage replication overhead.'), 'scopes': [LocalScope()] }, - { - 'name': 'NODE_CONCURRENT_EXECUTION', - 'default': 1, - 'description': _(u'Maximum amount of concurrent document OCRs a node can perform.'), - 'scopes': [LocalScope()] - }, - { - 'name': 'AUTOMATIC_OCR', - 'default': True, - 'description': _(u'Automatically queue newly created documents for OCR.'), - 'scopes': [LocalScope()] - }, - { - 'name': 'QUEUE_PROCESSING_INTERVAL', - 'default': 10, - 'description': _(u'Automatically queue newly created documents for OCR.'), - 'scopes': [LocalScope()] - }, { 'name': 'UNPAPER_PATH', - 'default': u'/usr/bin/unpaper', + 'default': DEFAULT_UNPAPER_PATH, 'description': _(u'File path to unpaper program.'), 'exists': True, 'scopes': [LocalScope()] }, { 'name': 'PDFTOTEXT_PATH', - 'default': u'/usr/bin/pdftotext', + 'default': DEFAULT_PDFTOTEXT_PATH, 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True, 'scopes': [LocalScope()] diff --git a/apps/ocr/urls.py b/apps/ocr/urls.py index 9d8666346b..8663bf24f3 100644 --- a/apps/ocr/urls.py +++ b/apps/ocr/urls.py @@ -1,9 +1,6 @@ from django.conf.urls.defaults import patterns, url urlpatterns = patterns('ocr.views', - url(r'^processing/enable/$', 'ocr_enable', (), 'ocr_enable'), - url(r'^processing/disable/$', 'ocr_disable', (), 'ocr_disable'), - url(r'^document/(?P\d+)/submit/$', 'submit_document', (), 'submit_document'), url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'), diff --git a/apps/ocr/views.py b/apps/ocr/views.py index 34cee09703..b68a9d657a 100644 --- a/apps/ocr/views.py +++ b/apps/ocr/views.py @@ -19,11 +19,7 @@ from job_processor.exceptions import JobQueuePushError from .permissions import (PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE, PERMISSION_OCR_QUEUE_ENABLE_DISABLE, PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_QUEUE_EDIT) -from .models import OCRProcessingSingleton -from .exceptions import (AlreadyQueued, ReQueueError, OCRProcessingAlreadyDisabled, - OCRProcessingAlreadyEnabled) from .api import clean_pages -from . import ocr_job_queue, ocr_job_type # {'name': _(u'document'), 'attribute': encapsulate(lambda x: document_link(x.document_version.document) if hasattr(x, 'document_version') else _(u'Missing document.'))}, @@ -32,58 +28,6 @@ from . import ocr_job_queue, ocr_job_type # {'name': _('submitted'), 'attribute': encapsulate(lambda x: unicode(x.datetime_submitted).split('.')[0]), 'keep_together':True}, -def ocr_disable(request): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]) - - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) - - if request.method == 'POST': - try: - OCRProcessingSingleton.get().disable() - except OCRProcessingAlreadyDisabled: - messages.warning(request, _(u'OCR processing already disabled.')) - return HttpResponseRedirect(previous) - else: - messages.success(request, _(u'OCR processing disabled successfully.')) - return HttpResponseRedirect(next) - - return render_to_response('generic_confirm.html', { - 'queue': OCRProcessingSingleton.get(), - 'navigation_object_name': 'queue', - 'title': _(u'Are you sure you wish to disable OCR processing?'), - 'next': next, - 'previous': previous, - 'form_icon': u'control_stop_blue.png', - }, context_instance=RequestContext(request)) - - -def ocr_enable(request): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_QUEUE_ENABLE_DISABLE]) - - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) - - if request.method == 'POST': - try: - OCRProcessingSingleton.get().enable() - except OCRProcessingAlreadyDisabled: - messages.warning(request, _(u'OCR processing already enabled.')) - return HttpResponseRedirect(previous) - else: - messages.success(request, _(u'OCR processing enabled successfully.')) - return HttpResponseRedirect(next) - - return render_to_response('generic_confirm.html', { - 'queue': OCRProcessingSingleton.get(), - 'navigation_object_name': 'queue', - 'title': _(u'Are you sure you wish to enable OCR processing?'), - 'next': next, - 'previous': previous, - 'form_icon': u'control_play_blue.png', - }, context_instance=RequestContext(request)) - - def submit_document_multiple(request): for item_id in request.GET.get('id_list', '').split(','): submit_document(request, item_id) diff --git a/docs/releases/0.13.rst b/docs/releases/0.13.rst index 5939db4665..ea737dfa9f 100644 --- a/docs/releases/0.13.rst +++ b/docs/releases/0.13.rst @@ -28,6 +28,7 @@ Overview #TODO: add clustering #TODO: add local jobs & distributed job processing #TODO: removal of DISABLE_HOME_VIEW +#TODO: removal of OCR_DEFAULT_NODE_CONCURRENT_EXECUTION, OCR_QUEUE_PROCESSING_INTERVAL What's new in Mayan EDMS v0.13 ============================== diff --git a/settings.py b/settings.py index 41c1552a49..6ac0ff15aa 100644 --- a/settings.py +++ b/settings.py @@ -178,7 +178,7 @@ INSTALLED_APPS = ( #'document_signatures', 'linking', 'metadata', - #'ocr', + 'ocr', 'main', #'installation', #'document_indexing', diff --git a/urls.py b/urls.py index 55a2cd9b1e..c7e841be89 100644 --- a/urls.py +++ b/urls.py @@ -15,7 +15,7 @@ urlpatterns = patterns('', (r'^documents/', include('documents.urls')), (r'^folders/', include('folders.urls')), (r'^search/', include('dynamic_search.urls')), - #(r'^ocr/', include('ocr.urls')), + (r'^ocr/', include('ocr.urls')), (r'^tags/', include('tags.urls')), (r'^comments/', include('document_comments.urls')), (r'^user_management/', include('user_management.urls')),