diff --git a/apps/documents/forms.py b/apps/documents/forms.py index 75df7bae0a..4b2b844016 100755 --- a/apps/documents/forms.py +++ b/apps/documents/forms.py @@ -72,10 +72,10 @@ class ImageWidget(forms.widgets.Widget): output.append('
%s' % ugettext(u'Click on the image for full size view')) - #for document_page in value.documentpage_set.all(): - # output.append('
%s)%s' % (document_page.page_number, - # reverse('document_page_view', args=[document_page.id]), - # ugettext(u'page view'))) + for document_page in value.documentpage_set.all(): + output.append('
%s)%s' % (document_page.page_number, + reverse('document_page_view', args=[document_page.id]), + ugettext(u'page view'))) #output.append(super(ImageWidget, self).render(name, value, attrs)) return mark_safe(u''.join(output)) diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py index 81ef308819..1460f2f210 100755 --- a/apps/ocr/__init__.py +++ b/apps/ocr/__init__.py @@ -1,21 +1,50 @@ +from multiprocessing import Queue + from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext +from django.db.utils import DatabaseError from common.api import register_links, register_menu from permissions.api import register_permissions from documents.models import Document -PERMISSION_OCR_DOCUMENT = 'ocr_document' +from models import DocumentQueue +from literals import QUEUEDOCUMENT_STATE_PROCESSING, \ + DOCUMENTQUEUE_STATE_STOPPED, QUEUEDOCUMENT_STATE_PENDING +from api import start_queue_watcher + +#Permissions +PERMISSION_OCR_DOCUMENT = 'ocr_document' register_permissions('ocr', [ {'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')}, ]) +#Links submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'args':'object.id', 'famfam':'page_lightning', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_DOCUMENT]}} - register_links(Document, [submit_document], menu_name='sidebar') +#Menus #register_menu([ # {'text':_('OCR'), 'view':'ocr_queue', 'links':[ # ocr_queue # ],'famfam':'hourglass','position':5}]) + + +try: + default_queue, created = DocumentQueue.objects.get_or_create(name='default') + if created: + default_queue.label = ugettext(u'Default') + default_queue.save() + + for queue in DocumentQueue.objects.all(): + queue.state = DOCUMENTQUEUE_STATE_STOPPED + queue.save() + start_queue_watcher(queue.name) + for document in queue.queuedocument_set.filter(state=QUEUEDOCUMENT_STATE_PROCESSING): + document.state = QUEUEDOCUMENT_STATE_PENDING + document.save() +except DatabaseError: + #syncdb + pass diff --git a/apps/ocr/admin.py b/apps/ocr/admin.py new file mode 100644 index 0000000000..b6bac71849 --- /dev/null +++ b/apps/ocr/admin.py @@ -0,0 +1,19 @@ +from django.contrib import admin + +from models import DocumentQueue, QueueDocument + + +class QueueDocumentInline(admin.StackedInline): + model = QueueDocument + extra = 1 + classes = ('collapse-open',) + allow_add = True + + +class DocumentQueueAdmin(admin.ModelAdmin): + inlines = [QueueDocumentInline] + list_directory = ('name', 'label', 'state') + + +admin.site.register(DocumentQueue, DocumentQueueAdmin) + diff --git a/apps/ocr/api.py b/apps/ocr/api.py index a0b6232db4..e215a245d2 100755 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -1,18 +1,29 @@ #Some code from http://wiki.github.com/hoffstaetter/python-tesseract import os +from multiprocessing import Process, Queue +from Queue import Empty import subprocess import tempfile from django.utils.translation import ugettext as _ +from django.contrib import messages -from documents.models import DocumentPage from common.conf.settings import TEMPORARY_DIRECTORY + +from documents.models import Document + from converter.api import convert_document_for_ocr from ocr.conf.settings import TESSERACT_PATH +from literals import QUEUEDOCUMENT_STATE_PROCESSING, \ + QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING + +from models import DocumentQueue + +queue_dict = {} def cleanup(filename): ''' tries to remove the given filename. Ignores non-existent files ''' @@ -46,7 +57,9 @@ def run_tesseract(input_filename, output_filename_base, lang=None): return (proc.wait(), proc.stderr.read()) -def ocr_document(document): +#def do_document_ocr(document): + +def do_document_ocr(document): for page_index, document_page in enumerate(document.documentpage_set.all()): imagefile = convert_document_for_ocr(document, page=page_index) desc, filepath = tempfile.mkstemp() @@ -60,8 +73,9 @@ def ocr_document(document): f = file(ocr_output) try: - document_page, created = DocumentPage.objects.get_or_create(document=document, - page_number=page_index+1) + #document_page, created = DocumentPage.objects.get_or_create(document=document, + # page_number=page_index+1) + document_page = document.documentpage_set.get(page_number=page_index+1) document_page.content = f.read().strip() document_page.page_label = _(u'Text from OCR') document_page.save() @@ -70,3 +84,63 @@ def ocr_document(document): cleanup(filepath) cleanup(ocr_output) cleanup(imagefile) + + +def do_queue_document(queue_document): + print 'do_queue_document' + queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING + queue_document.save() + + try: + do_document_ocr(queue_document.document) + queue_document.delete() + print 'ocr ended ok' + + except Exception, e: + print 'error', e + queue_document.state = QUEUEDOCUMENT_STATE_ERROR + queue_document.result = e + queue_document.save() + + + +def process_queue_document(queue_document): + #print 'process_queued_document' + #print 'test' ,queue_document.document.documentpage_set.all() + #print 'after' + d=Document.objects.get(id=42) + print d + print d.documentpage_set.all() + print 'after' + + p = Process(target=do_queue_document, args=(queue_document,)) + p.start() + + +def start_queue_watcher(queue_name): + + if queue_name in queue_dict: + print 'already started' + else: + queue_dict[queue_name] = Queue() + print 'start', queue_name + # if queue_name in queue_dict: + document_queue = DocumentQueue.objects.get(name=queue_name) + watcher = Process(target=queue_watcher, args=(document_queue,)) + watcher.start() + # else: + # raise Exception('No such queue: %s' % queue_name) + +import time +import sys +def queue_watcher(document_queue): + while True: + time.sleep(5) + try: + oldest_queued_document = document_queue.queuedocument_set.filter( + state=QUEUEDOCUMENT_STATE_PENDING).order_by('datetime_submitted')[0] + process_queue_document(oldest_queued_document) + print 'queue.get', oldest_queued_document + sys.stdout.flush() + except: + pass diff --git a/apps/ocr/literals.py b/apps/ocr/literals.py new file mode 100644 index 0000000000..a71e315435 --- /dev/null +++ b/apps/ocr/literals.py @@ -0,0 +1,21 @@ +from django.utils.translation import ugettext_lazy as _ + + +DOCUMENTQUEUE_STATE_STOPPED = 's' +DOCUMENTQUEUE_STATE_ACTIVE = 'a' + +DOCUMENTQUEUE_STATE_CHOICES = ( + (DOCUMENTQUEUE_STATE_STOPPED, _(u'stopped')), + (DOCUMENTQUEUE_STATE_ACTIVE, _(u'active')), +) + + +QUEUEDOCUMENT_STATE_PENDING = 'p' +QUEUEDOCUMENT_STATE_PROCESSING = 'i' +QUEUEDOCUMENT_STATE_ERROR = 'e' + +QUEUEDOCUMENT_STATE_CHOICES = ( + (QUEUEDOCUMENT_STATE_PENDING, _(u'pending')), + (QUEUEDOCUMENT_STATE_PROCESSING, _(u'processing')), + (QUEUEDOCUMENT_STATE_ERROR, _(u'error')), +) diff --git a/apps/ocr/models.py b/apps/ocr/models.py index 71a8362390..30e82591ee 100755 --- a/apps/ocr/models.py +++ b/apps/ocr/models.py @@ -1,3 +1,51 @@ from django.db import models +from django.utils.translation import ugettext_lazy as _ -# Create your models here. +from documents.models import Document + +from literals import DOCUMENTQUEUE_STATE_STOPPED,\ + DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING,\ + QUEUEDOCUMENT_STATE_CHOICES + + +#from api import queue_dict + + +class DocumentQueue(models.Model): + name = models.CharField(max_length=64, unique=True, verbose_name=_(u'name')) + label = models.CharField(max_length=64, verbose_name=_(u'label')) + state = models.CharField(max_length=4, + choices=DOCUMENTQUEUE_STATE_CHOICES, + default=DOCUMENTQUEUE_STATE_STOPPED, + verbose_name=_(u'state')) + + class Meta: + verbose_name = _(u'document queue') + verbose_name_plural = _(u'document queues') + + def __unicode__(self): + return self.label + +# def add_document(self, document): +# queue_document = QueueDocument(document_queue=self, document=document) +# queue_document.save() +# queue_dict[self.name].put(queue_document) + + +class QueueDocument(models.Model): + document_queue = models.ForeignKey(DocumentQueue, verbose_name=_(u'document queue')) + document = models.ForeignKey(Document, verbose_name=_(u'document')) + datetime_submitted = models.DateTimeField(verbose_name=_(u'date time submitted'), auto_now_add=True) + state = models.CharField(max_length=4, + choices=QUEUEDOCUMENT_STATE_CHOICES, + default=QUEUEDOCUMENT_STATE_PENDING, + verbose_name=_(u'state')) + result = models.TextField(blank=True, null=True, verbose_name=_(u'result')) + pid = models.PositiveIntegerField(blank=True, null=True, verbose_name=_(u'process id')) + + class Meta: + verbose_name = _(u'queue document') + verbose_name_plural = _(u'queue documents') + + def __unicode__(self): + return unicode(self.document) diff --git a/apps/ocr/views.py b/apps/ocr/views.py index e37e1b9284..5f51841089 100755 --- a/apps/ocr/views.py +++ b/apps/ocr/views.py @@ -12,9 +12,11 @@ from permissions.api import check_permissions, Unauthorized from documents.models import Document from ocr import PERMISSION_OCR_DOCUMENT -from api import ocr_document -def submit_document(request, document_id): +from models import DocumentQueue, QueueDocument + + +def submit_document(request, document_id, queue_name='default'): permissions = [PERMISSION_OCR_DOCUMENT] try: check_permissions(request.user, 'ocr', permissions) @@ -23,11 +25,10 @@ def submit_document(request, document_id): document = get_object_or_404(Document, pk=document_id) - try: - result = ocr_document(document) - except Exception, e: - messages.error(request, e) - return HttpResponseRedirect(request.META['HTTP_REFERER']) - - messages.success(request, _(u'Document OCR was successful.')) + document_queue = get_object_or_404(DocumentQueue, name=queue_name) + #document_queue.add_document(document) + queue_document = QueueDocument(document_queue=document_queue, document=document) + queue_document.save() + + messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label)) return HttpResponseRedirect(request.META['HTTP_REFERER']) diff --git a/docs/TODO b/docs/TODO index 3b98897afb..390faf48d7 100755 --- a/docs/TODO +++ b/docs/TODO @@ -75,3 +75,4 @@ * Don't append an extension separator if extension is non existant * Don't do OCR on wordproccessing or spreadsheet document, strip tags and store text * Storage backend to storage backend copy support, to move/migrate document to new storage backend +* Tesserat default option ocr setup