Changed from python's multiprocessing to celery to handle concurrency

2011-02-17 03:45:30 -04:00
parent 409a52af95
commit 478fb3502e
13 changed files with 102 additions and 87 deletions
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -1,9 +1,6 @@
 #Some code from http://wiki.github.com/hoffstaetter/python-tesseract

 import os
-from multiprocessing import Process, Queue
-from Queue import Empty
-
 import subprocess
 import tempfile

@@ -18,12 +15,11 @@ from converter.api import convert_document_for_ocr

 from ocr.conf.settings import TESSERACT_PATH

-from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
-    QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING
+#from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
+#    QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING

-from models import DocumentQueue
+#from models import DocumentQueue

-queue_dict = {}

 def cleanup(filename):
    ''' tries to remove the given filename. Ignores non-existent files '''
@@ -57,8 +53,6 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
    return (proc.wait(), proc.stderr.read())


-#def do_document_ocr(document):
-
 def do_document_ocr(document):
    for page_index, document_page in enumerate(document.documentpage_set.all()):    
        imagefile = convert_document_for_ocr(document, page=page_index)
@@ -73,8 +67,6 @@ def do_document_ocr(document):

        f = file(ocr_output)
        try:
-            #document_page, created = DocumentPage.objects.get_or_create(document=document,
-            #    page_number=page_index+1)
            document_page = document.documentpage_set.get(page_number=page_index+1)
            document_page.content = f.read().strip()
            document_page.page_label = _(u'Text from OCR')
@@ -84,63 +76,3 @@ def do_document_ocr(document):
            cleanup(filepath)
            cleanup(ocr_output)
            cleanup(imagefile)
-
-
-def do_queue_document(queue_document):
-    print 'do_queue_document'
-    queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
-    queue_document.save()
-
-    try:
-        do_document_ocr(queue_document.document)
-        queue_document.delete()
-        print 'ocr ended ok'
-
-    except Exception, e:
-        print 'error', e
-        queue_document.state = QUEUEDOCUMENT_STATE_ERROR
-        queue_document.result = e
-        queue_document.save()
-        
-
-
-def process_queue_document(queue_document):
-    #print 'process_queued_document'
-    #print 'test' ,queue_document.document.documentpage_set.all()    
-    #print 'after'
-    d=Document.objects.get(id=42)
-    print d
-    print d.documentpage_set.all()
-    print 'after'
-    
-    p = Process(target=do_queue_document, args=(queue_document,))
-    p.start()
-    
-
-def start_queue_watcher(queue_name):
-
-    if queue_name in queue_dict:
-        print 'already started'
-    else:
-        queue_dict[queue_name] = Queue()
-        print 'start', queue_name
-    #    if queue_name in queue_dict:
-        document_queue = DocumentQueue.objects.get(name=queue_name)
-        watcher = Process(target=queue_watcher, args=(document_queue,))
-        watcher.start()
-    #    else:
-    #        raise Exception('No such queue: %s' % queue_name)
-
-import time
-import sys
-def queue_watcher(document_queue):
-    while True:
-        time.sleep(5)
-        try:
-            oldest_queued_document = document_queue.queuedocument_set.filter(
-                state=QUEUEDOCUMENT_STATE_PENDING).order_by('datetime_submitted')[0]
-            process_queue_document(oldest_queued_document)
-            print 'queue.get', oldest_queued_document
-            sys.stdout.flush()
-        except:
-            pass