Made the concurrent OCR code more granular: the concurrency limit is now applied per node, so each node can handle a different number of concurrent OCR tasks

This commit is contained in:
Roberto Rosario
2011-03-22 04:17:48 -04:00
parent d0942a203b
commit 3cb0f37b5b
6 changed files with 13 additions and 8 deletions

View File

@@ -78,8 +78,8 @@ def check_settings(request):
# OCR
{'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True},
{'name':'OCR_TESSERACT_LANGUAGE', 'value':ocr_settings.TESSERACT_LANGUAGE},
{'name':'OCR_MAX_CONCURRENT_EXECUTION', 'value':ocr_settings.MAX_CONCURRENT_EXECUTION},
{'name':'OCR_NODE_CONCURRENT_EXECUTION', 'value':ocr_settings.NODE_CONCURRENT_EXECUTION},
{'name':'OCR_REPLICATION_DELAY', 'value':ocr_settings.REPLICATION_DELAY},
# Search
{'name':'SEARCH_LIMIT', 'value':search_settings.LIMIT},

View File

@@ -2,5 +2,5 @@ from django.conf import settings
TESSERACT_PATH = getattr(settings, 'OCR_TESSERACT_PATH', u'/usr/bin/tesseract')
TESSERACT_LANGUAGE = getattr(settings, 'OCR_TESSERACT_LANGUAGE', None)
MAX_CONCURRENT_EXECUTION = getattr(settings, 'OCR_MAX_CONCURRENT_EXECUTION', 2)
REPLICATION_DELAY = getattr(settings, 'OCR_REPLICATION_DELAY', 10) #In seconds
NODE_CONCURRENT_EXECUTION = getattr(settings, 'OCR_NODE_CONCURRENT_EXECUTION', 1)

View File

@@ -51,6 +51,7 @@ class QueueDocument(models.Model):
default=QUEUEDOCUMENT_STATE_PENDING,
verbose_name=_(u'state'))
result = models.TextField(blank=True, null=True, verbose_name=_(u'result'))
node_name = models.CharField(max_length=32, verbose_name=_(u'node name'), blank=True, null=True)
class Meta:
ordering = ('datetime_submitted',)

View File

@@ -1,4 +1,5 @@
from datetime import date, timedelta, datetime
import platform
from django.db.models import Q
@@ -12,7 +13,7 @@ from literals import QUEUEDOCUMENT_STATE_PENDING, \
QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_ACTIVE, \
QUEUEDOCUMENT_STATE_ERROR
from models import QueueDocument, DocumentQueue
from ocr.conf.settings import MAX_CONCURRENT_EXECUTION
from ocr.conf.settings import NODE_CONCURRENT_EXECUTION
from ocr.conf.settings import REPLICATION_DELAY
@@ -20,6 +21,7 @@ from ocr.conf.settings import REPLICATION_DELAY
def task_process_queue_document(queue_document_id):
queue_document = QueueDocument.objects.get(id=queue_document_id)
queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
queue_document.node_name = platform.node()
queue_document.save()
try:
do_document_ocr(queue_document.document)
@@ -42,8 +44,9 @@ class DocumentQueueWatcher(PeriodicTask):
q_delay_interval = Q(datetime_submitted__lt=datetime.now()-timedelta(seconds=REPLICATION_DELAY))
for document_queue in DocumentQueue.objects.filter(state=DOCUMENTQUEUE_STATE_ACTIVE):
logger.debug('Analysing queue: %s' % document_queue)
current_running_queues = QueueDocument.objects.filter(state=QUEUEDOCUMENT_STATE_PROCESSING).count()
if current_running_queues < MAX_CONCURRENT_EXECUTION:
if QueueDocument.objects.filter(
state=QUEUEDOCUMENT_STATE_PROCESSING).filter(
node_name=platform.node()).count() < NODE_CONCURRENT_EXECUTION:
try:
oldest_queued_document_qs = document_queue.queuedocument_set.filter(
(q_pending & ~q_delayed) | (q_pending & q_delayed & q_delay_interval))

View File

@@ -51,6 +51,7 @@ def queue_document_list(request, queue_name='default'):
{'name':'submitted', 'attribute': lambda x: unicode(x.datetime_submitted).split('.')[0], 'keep_together':True},
{'name':'delay', 'attribute':'delay'},
{'name':'state', 'attribute': lambda x: x.get_state_display()},
{'name':'node', 'attribute':'node_name'},
{'name':'result', 'attribute':'result'},
],
'multi_select_as_buttons':True,

View File

@@ -243,7 +243,7 @@ LOGIN_EXEMPT_URLS = (
# OCR
#OCR_TESSERACT_PATH = u'/usr/bin/tesseract'
#OCR_MAX_CONCURRENT_EXECUTION = 2
#OCR_NODE_CONCURRENT_EXECUTION = 1
#OCR_TESSERACT_LANGUAGE = None
#OCR_REPLICATION_DELAY = 10