diff --git a/apps/documents/forms.py b/apps/documents/forms.py
index 75df7bae0a..4b2b844016 100755
--- a/apps/documents/forms.py
+++ b/apps/documents/forms.py
@@ -72,10 +72,10 @@ class ImageWidget(forms.widgets.Widget):
output.append('
%s' % ugettext(u'Click on the image for full size view'))
- #for document_page in value.documentpage_set.all():
- # output.append('
%s)%s' % (document_page.page_number,
- # reverse('document_page_view', args=[document_page.id]),
- # ugettext(u'page view')))
+ for document_page in value.documentpage_set.all():
+ output.append('
%s)%s' % (document_page.page_number,
+ reverse('document_page_view', args=[document_page.id]),
+ ugettext(u'page view')))
#output.append(super(ImageWidget, self).render(name, value, attrs))
return mark_safe(u''.join(output))
diff --git a/apps/ocr/__init__.py b/apps/ocr/__init__.py
index 81ef308819..1460f2f210 100755
--- a/apps/ocr/__init__.py
+++ b/apps/ocr/__init__.py
@@ -1,21 +1,50 @@
+from multiprocessing import Queue
+
from django.utils.translation import ugettext_lazy as _
+from django.utils.translation import ugettext
+from django.db.utils import DatabaseError
from common.api import register_links, register_menu
from permissions.api import register_permissions
from documents.models import Document
-PERMISSION_OCR_DOCUMENT = 'ocr_document'
+from models import DocumentQueue
+from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
+ DOCUMENTQUEUE_STATE_STOPPED, QUEUEDOCUMENT_STATE_PENDING
+from api import start_queue_watcher
+
+#Permissions
+PERMISSION_OCR_DOCUMENT = 'ocr_document'
register_permissions('ocr', [
{'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
])
+#Links
submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'args':'object.id', 'famfam':'page_lightning', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_DOCUMENT]}}
-
register_links(Document, [submit_document], menu_name='sidebar')
+#Menus
#register_menu([
# {'text':_('OCR'), 'view':'ocr_queue', 'links':[
# ocr_queue
# ],'famfam':'hourglass','position':5}])
+
+
+try:
+    # Runs at import time: make sure the 'default' queue exists, then bring
+    # every queue back to a known state.
+    default_queue, created = DocumentQueue.objects.get_or_create(
+        name='default', defaults={'label': ugettext(u'Default')})
+    for queue in DocumentQueue.objects.all():
+        queue.state = DOCUMENTQUEUE_STATE_STOPPED
+        queue.save()
+        # Requeue documents still marked as processing *before* starting the
+        # watcher process, so the watcher can never race this recovery step.
+        for document in queue.queuedocument_set.filter(state=QUEUEDOCUMENT_STATE_PROCESSING):
+            document.state = QUEUEDOCUMENT_STATE_PENDING
+            document.save()
+        start_queue_watcher(queue.name)
+except DatabaseError:
+    pass  # original note: "syncdb" -- presumably the tables do not exist yet
diff --git a/apps/ocr/admin.py b/apps/ocr/admin.py
new file mode 100644
index 0000000000..b6bac71849
--- /dev/null
+++ b/apps/ocr/admin.py
@@ -0,0 +1,19 @@
+from django.contrib import admin
+
+from models import DocumentQueue, QueueDocument
+
+
+class QueueDocumentInline(admin.StackedInline):  # QueueDocument rows edited inline on another model's admin page
+    model = QueueDocument
+    classes = ('collapse-open',)  # NOTE(review): presumably consumed by the admin theme -- confirm
+    allow_add = True  # NOTE(review): presumably a theme-specific inline option as well -- confirm
+    extra = 1  # number of extra blank forms
+
+
+class DocumentQueueAdmin(admin.ModelAdmin):  # admin page for OCR queues, with their queued documents inline
+    inlines = [QueueDocumentInline]
+    list_display = ('name', 'label', 'state')  # was `list_directory`, which is not a ModelAdmin option and was ignored
+
+
+admin.site.register(DocumentQueue, DocumentQueueAdmin)
+
diff --git a/apps/ocr/api.py b/apps/ocr/api.py
index a0b6232db4..e215a245d2 100755
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,18 +1,29 @@
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
import os
+from multiprocessing import Process, Queue
+from Queue import Empty
import subprocess
import tempfile
from django.utils.translation import ugettext as _
+from django.contrib import messages
-from documents.models import DocumentPage
from common.conf.settings import TEMPORARY_DIRECTORY
+
+from documents.models import Document
+
from converter.api import convert_document_for_ocr
from ocr.conf.settings import TESSERACT_PATH
+from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
+ QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING
+
+from models import DocumentQueue
+
+queue_dict = {}
def cleanup(filename):
''' tries to remove the given filename. Ignores non-existent files '''
@@ -46,7 +57,9 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
return (proc.wait(), proc.stderr.read())
-def ocr_document(document):
+#def do_document_ocr(document):
+
+def do_document_ocr(document):
for page_index, document_page in enumerate(document.documentpage_set.all()):
imagefile = convert_document_for_ocr(document, page=page_index)
desc, filepath = tempfile.mkstemp()
@@ -60,8 +73,9 @@ def ocr_document(document):
f = file(ocr_output)
try:
- document_page, created = DocumentPage.objects.get_or_create(document=document,
- page_number=page_index+1)
+ #document_page, created = DocumentPage.objects.get_or_create(document=document,
+ # page_number=page_index+1)
+ document_page = document.documentpage_set.get(page_number=page_index+1)
document_page.content = f.read().strip()
document_page.page_label = _(u'Text from OCR')
document_page.save()
@@ -70,3 +84,63 @@ def ocr_document(document):
cleanup(filepath)
cleanup(ocr_output)
cleanup(imagefile)
+
+
+def do_queue_document(queue_document):
+    """Run OCR for one queued document; used as the target of a worker process.
+
+    Marks `queue_document` as processing, then deletes it on success or
+    marks it as failed (state plus `result` text) when anything raises.
+    """
+    queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
+    queue_document.save()
+    try:
+        do_document_ocr(queue_document.document)
+        queue_document.delete()
+    except Exception as e:
+        queue_document.state = QUEUEDOCUMENT_STATE_ERROR
+        queue_document.result = repr(e)  # store text, not the exception object; repr() also survives non-ASCII messages
+        queue_document.save()
+
+
+
+def process_queue_document(queue_document):
+    """Start OCR of `queue_document` in a separate process.
+
+    The child runs `do_queue_document`; this function returns right after
+    starting it and does not wait for the OCR to finish.
+    """
+    # Debug code that fetched and printed Document id=42 used to live here.
+    # It raised DoesNotExist whenever that row was missing, which aborted the
+    # hand-off before the worker process was ever started.
+    p = Process(target=do_queue_document, args=(queue_document,))
+    p.start()
+
+
+def start_queue_watcher(queue_name):
+    """Start the watcher process for the queue named `queue_name`, once.
+
+    `queue_dict` records which queue names were already started from here.
+    """
+    if queue_name in queue_dict:
+        print 'already started'
+        return
+    queue_dict[queue_name] = Queue()
+    print 'start', queue_name
+    # DocumentQueue.DoesNotExist propagates if no queue has this name.
+    queue_process = Process(target=queue_watcher, args=(DocumentQueue.objects.get(name=queue_name),))
+    queue_process.start()
+
+import time
+import sys
+def queue_watcher(document_queue):
+    """Poll `document_queue` every 5 seconds; hand its oldest pending document to a worker."""
+    while True:
+        time.sleep(5)
+        try:
+            pending = document_queue.queuedocument_set.filter(
+                state=QUEUEDOCUMENT_STATE_PENDING).order_by('datetime_submitted')[:1]
+            for oldest_queued_document in pending:  # zero or one item; an empty queue is not an error
+                process_queue_document(oldest_queued_document)
+        except Exception as e:  # not a bare except: KeyboardInterrupt/SystemExit must still stop the watcher
+            sys.stderr.write('queue_watcher error: %r\n' % (e,))  # report instead of silently swallowing
diff --git a/apps/ocr/literals.py b/apps/ocr/literals.py
new file mode 100644
index 0000000000..a71e315435
--- /dev/null
+++ b/apps/ocr/literals.py
@@ -0,0 +1,21 @@
+from django.utils.translation import ugettext_lazy as _
+
+
+DOCUMENTQUEUE_STATE_STOPPED = 's'  # default value of DocumentQueue.state
+DOCUMENTQUEUE_STATE_ACTIVE = 'a'
+# (stored code, translatable label) pairs used as `choices` for DocumentQueue.state.
+DOCUMENTQUEUE_STATE_CHOICES = (
+ (DOCUMENTQUEUE_STATE_STOPPED, _(u'stopped')),
+ (DOCUMENTQUEUE_STATE_ACTIVE, _(u'active')),
+)
+
+# Codes for QueueDocument.state; PENDING is the default for new entries.
+QUEUEDOCUMENT_STATE_PENDING = 'p'
+QUEUEDOCUMENT_STATE_PROCESSING = 'i'
+QUEUEDOCUMENT_STATE_ERROR = 'e'
+# (stored code, translatable label) pairs used as `choices` for QueueDocument.state.
+QUEUEDOCUMENT_STATE_CHOICES = (
+ (QUEUEDOCUMENT_STATE_PENDING, _(u'pending')),
+ (QUEUEDOCUMENT_STATE_PROCESSING, _(u'processing')),
+ (QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
+)
diff --git a/apps/ocr/models.py b/apps/ocr/models.py
index 71a8362390..30e82591ee 100755
--- a/apps/ocr/models.py
+++ b/apps/ocr/models.py
@@ -1,3 +1,51 @@
from django.db import models
+from django.utils.translation import ugettext_lazy as _
-# Create your models here.
+from documents.models import Document
+
+from literals import DOCUMENTQUEUE_STATE_STOPPED,\
+ DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING,\
+ QUEUEDOCUMENT_STATE_CHOICES
+
+
+#from api import queue_dict
+
+
+class DocumentQueue(models.Model):
+    """A named OCR queue; `state` holds one of the DOCUMENTQUEUE_STATE_* codes."""
+    name = models.CharField(verbose_name=_(u'name'), max_length=64, unique=True)
+    label = models.CharField(verbose_name=_(u'label'), max_length=64)
+    state = models.CharField(verbose_name=_(u'state'), max_length=4,
+        choices=DOCUMENTQUEUE_STATE_CHOICES, default=DOCUMENTQUEUE_STATE_STOPPED)
+
+    class Meta:
+        verbose_name = _(u'document queue')
+        verbose_name_plural = _(u'document queues')
+
+    def __unicode__(self):
+        # The human-readable label, not the unique name, represents the queue.
+        return self.label
+
+# def add_document(self, document):
+# queue_document = QueueDocument(document_queue=self, document=document)
+# queue_document.save()
+# queue_dict[self.name].put(queue_document)
+
+
+class QueueDocument(models.Model):
+    """One document's entry in a DocumentQueue, with its QUEUEDOCUMENT_STATE_* state."""
+    document_queue = models.ForeignKey(DocumentQueue, verbose_name=_(u'document queue'))
+    document = models.ForeignKey(Document, verbose_name=_(u'document'))
+    datetime_submitted = models.DateTimeField(auto_now_add=True, verbose_name=_(u'date time submitted'))
+    state = models.CharField(verbose_name=_(u'state'), max_length=4,
+        choices=QUEUEDOCUMENT_STATE_CHOICES, default=QUEUEDOCUMENT_STATE_PENDING)
+    # `result` receives the error when OCR of the document fails.
+    result = models.TextField(verbose_name=_(u'result'), null=True, blank=True)
+    pid = models.PositiveIntegerField(verbose_name=_(u'process id'), null=True, blank=True)
+
+    class Meta:
+        verbose_name = _(u'queue document')
+        verbose_name_plural = _(u'queue documents')
+
+    def __unicode__(self):
+        return unicode(self.document)
diff --git a/apps/ocr/views.py b/apps/ocr/views.py
index e37e1b9284..5f51841089 100755
--- a/apps/ocr/views.py
+++ b/apps/ocr/views.py
@@ -12,9 +12,11 @@ from permissions.api import check_permissions, Unauthorized
from documents.models import Document
from ocr import PERMISSION_OCR_DOCUMENT
-from api import ocr_document
-def submit_document(request, document_id):
+from models import DocumentQueue, QueueDocument
+
+
+def submit_document(request, document_id, queue_name='default'):
permissions = [PERMISSION_OCR_DOCUMENT]
try:
check_permissions(request.user, 'ocr', permissions)
@@ -23,11 +25,10 @@ def submit_document(request, document_id):
document = get_object_or_404(Document, pk=document_id)
- try:
- result = ocr_document(document)
- except Exception, e:
- messages.error(request, e)
- return HttpResponseRedirect(request.META['HTTP_REFERER'])
-
- messages.success(request, _(u'Document OCR was successful.'))
+ document_queue = get_object_or_404(DocumentQueue, name=queue_name)
+ #document_queue.add_document(document)
+ queue_document = QueueDocument(document_queue=document_queue, document=document)
+ queue_document.save()
+
+ messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label))
return HttpResponseRedirect(request.META['HTTP_REFERER'])
diff --git a/docs/TODO b/docs/TODO
index 3b98897afb..390faf48d7 100755
--- a/docs/TODO
+++ b/docs/TODO
@@ -75,3 +75,4 @@
* Don't append an extension separator if extension is non existant
* Don't do OCR on wordproccessing or spreadsheet document, strip tags and store text
* Storage backend to storage backend copy support, to move/migrate document to new storage backend
+* Tesseract default option OCR setup