From 77b8a432a2f0b0e72de3644b5318342f8a20c432 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 17 Feb 2011 04:37:35 -0400 Subject: [PATCH] Added distributed OCR queue support --- README.md | 3 +++ apps/documents/__init__.py | 2 +- apps/documents/conf/settings.py | 1 + apps/documents/views.py | 10 ++++++++- apps/dynamic_search/__init__.py | 2 +- apps/main/views.py | 2 ++ apps/ocr/__init__.py | 9 ++++----- apps/ocr/api.py | 5 ----- apps/ocr/models.py | 15 +++++++++----- apps/ocr/urls.py | 1 + apps/ocr/views.py | 36 +++++++++++++++++++++++++-------- docs/TODO | 4 ++++ settings.py | 2 ++ 13 files changed, 66 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 8561c7e094..58fb1e87a4 100755 --- a/README.md +++ b/README.md @@ -32,6 +32,8 @@ Python: * Django - A high-level Python Web framework that encourages rapid development and clean, pragmatic design. * django-pagination * django-filetransfers - File upload/download abstraction +* django-celery +* celery Or execute pip install -r requirements/production.txt to install the dependencies automatically. @@ -42,6 +44,7 @@ Executables: * tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google. 
* unpaper - post-processing scanned and photocopied book pages + License ------- See docs/LICENSE file diff --git a/apps/documents/__init__.py b/apps/documents/__init__.py index a713be3bad..9c65cd9800 100755 --- a/apps/documents/__init__.py +++ b/apps/documents/__init__.py @@ -76,6 +76,6 @@ register_model_list_columns(Document, [ register_menu([ {'text':_('documents'), 'view':'document_create', 'links':[ document_create, document_create_multiple, document_list - ],'famfam':'page','position':4}]) + ],'famfam':'page','position':1}]) TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp() diff --git a/apps/documents/conf/settings.py b/apps/documents/conf/settings.py index a07fce99b8..055ff5ef8a 100755 --- a/apps/documents/conf/settings.py +++ b/apps/documents/conf/settings.py @@ -34,6 +34,7 @@ STAGING_DIRECTORY = getattr(settings, 'DOCUMENTS_STAGING_DIRECTORY', u'/tmp/maya DELETE_STAGING_FILE_AFTER_UPLOAD = getattr(settings, 'DOCUMENTS_DELETE_STAGING_FILE_AFTER_UPLOAD', False) STAGING_FILES_PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_STAGING_FILES_PREVIEW_SIZE', '640x480') DELETE_LOCAL_ORIGINAL = getattr(settings, 'DOCUMENTS_DELETE_LOCAL_ORIGINAL', False) +AUTOMATIC_OCR = getattr(settings, 'DOCUMENTS_AUTOMATIC_OCR', False) # Saving CHECKSUM_FUNCTION = getattr(settings, 'DOCUMENTS_CHECKSUM_FUNCTION', lambda x: hashlib.sha256(x).hexdigest()) diff --git a/apps/documents/views.py b/apps/documents/views.py index 6012d72050..a47fdfc911 100755 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -30,6 +30,8 @@ from forms import DocumentTypeSelectForm, DocumentCreateWizard, \ from staging import StagingFile +from ocr.models import add_document_to_queue + from documents.conf.settings import DELETE_STAGING_FILE_AFTER_UPLOAD from documents.conf.settings import USE_STAGING_DIRECTORY from documents.conf.settings import FILESYSTEM_FILESERVING_ENABLE @@ -39,6 +41,7 @@ from documents.conf.settings import 
THUMBNAIL_SIZE from documents.conf.settings import GROUP_MAX_RESULTS from documents.conf.settings import GROUP_SHOW_EMPTY from documents.conf.settings import DEFAULT_TRANSFORMATIONS +from documents.conf.settings import AUTOMATIC_OCR from documents import PERMISSION_DOCUMENT_CREATE, \ @@ -130,6 +133,9 @@ def upload_document_with_type(request, document_type_id, multiple=True): instance.update_mimetype() instance.update_page_count() instance.apply_default_transformations() + if AUTOMATIC_OCR: + document_queue = add_document_to_queue(instance) + messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (instance, document_queue.label)) if 'document_type_available_filenames' in local_form.cleaned_data: if local_form.cleaned_data['document_type_available_filenames']: @@ -165,7 +171,9 @@ def upload_document_with_type(request, document_type_id, multiple=True): document.update_mimetype() document.update_page_count() document.apply_default_transformations() - + if AUTOMATIC_OCR: + document_queue = add_document_to_queue(document) + messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label)) except Exception, e: messages.error(request, e) else: diff --git a/apps/dynamic_search/__init__.py b/apps/dynamic_search/__init__.py index 82bdbfac9a..a4333c28f0 100755 --- a/apps/dynamic_search/__init__.py +++ b/apps/dynamic_search/__init__.py @@ -2,5 +2,5 @@ from django.utils.translation import ugettext_lazy as _ from common.api import register_menu register_menu([ - {'text':_(u'search'), 'view':'search', 'famfam':'zoom', 'position':5}, + {'text':_(u'search'), 'view':'search', 'famfam':'zoom', 'position':2}, ]) diff --git a/apps/main/views.py b/apps/main/views.py index 4e8cc6112e..0e32786adc 100755 --- a/apps/main/views.py +++ b/apps/main/views.py @@ -35,6 +35,7 @@ def check_settings(request): {'name':'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', 'value':documents_settings.FILESYSTEM_FILESERVING_PATH, 'exists':True}, 
{'name':'DOCUMENTS_SLUGIFY_PATHS', 'value':documents_settings.FILESYSTEM_SLUGIFY_PATHS}, {'name':'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 'value':documents_settings.FILESYSTEM_MAX_RENAME_COUNT}, + {'name':'DOCUMENTS_AUTOMATIC_OCR', 'value':documents_settings.AUTOMATIC_OCR}, #Common {'name':'COMMON_TEMPORARY_DIRECTORY', 'value':common_settings.TEMPORARY_DIRECTORY, 'exists':True}, @@ -45,6 +46,7 @@ def check_settings(request): #ocr {'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True}, + {'name':'OCR_MAX_CONCURRENT_EXECUTION', 'value':ocr_settings.MAX_CONCURRENT_EXECUTION}, ] context={ diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py index ea655b8296..09ec87e8ac 100755 --- a/apps/ocr/__init__.py +++ b/apps/ocr/__init__.py @@ -25,11 +25,10 @@ submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'a register_links(Document, [submit_document], menu_name='sidebar') #Menus -#register_menu([ -# {'text':_('OCR'), 'view':'ocr_queue', 'links':[ -# ocr_queue -# ],'famfam':'hourglass','position':5}]) - +register_menu([ + {'text':_('OCR'), 'view':'queue_document_list', 'links':[ + #ocr_queue + ],'famfam':'hourglass','position':4}]) try: default_queue, created = DocumentQueue.objects.get_or_create(name='default') diff --git a/apps/ocr/api.py b/apps/ocr/api.py index c0c73306b7..57b1bb54eb 100755 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -15,11 +15,6 @@ from converter.api import convert_document_for_ocr from ocr.conf.settings import TESSERACT_PATH -#from literals import QUEUEDOCUMENT_STATE_PROCESSING, \ -# QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING - -#from models import DocumentQueue - def cleanup(filename): ''' tries to remove the given filename. 
Ignores non-existent files ''' diff --git a/apps/ocr/models.py b/apps/ocr/models.py index 03d9b837bb..fb696a161f 100755 --- a/apps/ocr/models.py +++ b/apps/ocr/models.py @@ -6,6 +6,13 @@ from documents.models import Document from literals import DOCUMENTQUEUE_STATE_STOPPED,\ DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING,\ QUEUEDOCUMENT_STATE_CHOICES + + +def add_document_to_queue(document, queue_name='default'): + document_queue = DocumentQueue.objects.get(name=queue_name) + queue_document = QueueDocument(document_queue=document_queue, document=document) + queue_document.save() + return document_queue class DocumentQueue(models.Model): @@ -23,10 +30,9 @@ class DocumentQueue(models.Model): def __unicode__(self): return self.label -# def add_document(self, document): -# queue_document = QueueDocument(document_queue=self, document=document) -# queue_document.save() -# queue_dict[self.name].put(queue_document) + def add_document(self, document): + queue_document = QueueDocument(document_queue=self, document=document) + queue_document.save() class QueueDocument(models.Model): @@ -38,7 +44,6 @@ class QueueDocument(models.Model): default=QUEUEDOCUMENT_STATE_PENDING, verbose_name=_(u'state')) result = models.TextField(blank=True, null=True, verbose_name=_(u'result')) - pid = models.PositiveIntegerField(blank=True, null=True, verbose_name=_(u'process id')) class Meta: verbose_name = _(u'queue document') diff --git a/apps/ocr/urls.py b/apps/ocr/urls.py index 39dc2a35ad..9f9ec49011 100755 --- a/apps/ocr/urls.py +++ b/apps/ocr/urls.py @@ -3,4 +3,5 @@ from django.utils.translation import ugettext_lazy as _ urlpatterns = patterns('ocr.views', url(r'^(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'), + url(r'^ocr/queue/document/list/$', 'queue_document_list', (), 'queue_document_list'), ) diff --git a/apps/ocr/views.py b/apps/ocr/views.py index f39b902cc7..344cb8aaeb 100755 --- a/apps/ocr/views.py +++ b/apps/ocr/views.py @@ -13,10 +13,36 @@ from 
documents.models import Document from ocr import PERMISSION_OCR_DOCUMENT -from models import DocumentQueue, QueueDocument +from models import DocumentQueue, QueueDocument, add_document_to_queue from tasks import do_document_ocr_task + +def queue_document_list(request, queue_name='default'): + permissions = [PERMISSION_OCR_DOCUMENT] + try: + check_permissions(request.user, 'ocr', permissions) + except Unauthorized, e: + raise Http404(e) + + document_queue = get_object_or_404(DocumentQueue, name=queue_name) + + return object_list( + request, + queryset=document_queue.queuedocument_set.all(), + template_name='generic_list.html', + extra_context={ + 'title':_(u'queued documents'), + 'hide_object':True, + 'extra_columns':[ + {'name':'document', 'attribute': 'document'}, + {'name':'submitted', 'attribute': lambda x: unicode(x.datetime_submitted).split('.')[0]}, + {'name':'state', 'attribute': lambda x: x.get_state_display()}, + ], + }, + ) + + def submit_document(request, document_id, queue_name='default'): permissions = [PERMISSION_OCR_DOCUMENT] try: @@ -27,13 +53,7 @@ def submit_document(request, document_id, queue_name='default'): document = get_object_or_404(Document, pk=document_id) document_queue = get_object_or_404(DocumentQueue, name=queue_name) - do_document_ocr_task.delay(document.id) - ##document_queue.add_document(document) - #queue_document = QueueDocument(document_queue=document_queue, document=document) - #queue_document.save() - - - #add.delay(1,2) + add_document_to_queue(document, document_queue.name) messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label)) return HttpResponseRedirect(request.META['HTTP_REFERER']) diff --git a/docs/TODO b/docs/TODO index 52d43c5cc0..5fa2dc4748 100755 --- a/docs/TODO +++ b/docs/TODO @@ -75,3 +75,7 @@ * Don't do OCR on wordproccessing or spreadsheet document, strip tags and store text * Storage backend to storage backend copy support, to move/migrate document to new 
storage backend * Tesserat default option ocr setup +* Do separate default transformations for staging and for local uploads +* Multiple ocr queue support +* Enable/disable ocr queue view & links +* Restrict view permission free form rename diff --git a/settings.py b/settings.py index c218b573f9..ed2d2af436 100755 --- a/settings.py +++ b/settings.py @@ -179,6 +179,8 @@ LOGIN_EXEMPT_URLS = ( #DOCUMENTS_STAGING_DIRECTORY = u'/tmp/mayan/staging' #DOCUMENTS_DELETE_STAGING_FILE_AFTER_UPLOAD = False #DOCUMENTS_STAGING_FILES_PREVIEW_SIZE = '640x480' +#DOCUMENTS_AUTOMATIC_OCR = False + # Saving #DOCUMENTS_CHECKSUM_FUNCTION = lambda x: hashlib.sha256(x).hexdigest())