Made the concurrent OCR code more granular: concurrency is now limited per node, so every node can handle a different number of concurrent OCR tasks
This commit is contained in:
@@ -78,9 +78,9 @@ def check_settings(request):
|
||||
# OCR
|
||||
{'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True},
|
||||
{'name':'OCR_TESSERACT_LANGUAGE', 'value':ocr_settings.TESSERACT_LANGUAGE},
|
||||
{'name':'OCR_MAX_CONCURRENT_EXECUTION', 'value':ocr_settings.MAX_CONCURRENT_EXECUTION},
|
||||
|
||||
|
||||
{'name':'OCR_NODE_CONCURRENT_EXECUTION', 'value':ocr_settings.NODE_CONCURRENT_EXECUTION},
|
||||
{'name':'OCR_REPLICATION_DELAY', 'value':ocr_settings.REPLICATION_DELAY},
|
||||
|
||||
# Search
|
||||
{'name':'SEARCH_LIMIT', 'value':search_settings.LIMIT},
|
||||
]
|
||||
|
||||
@@ -2,5 +2,5 @@ from django.conf import settings
|
||||
|
||||
TESSERACT_PATH = getattr(settings, 'OCR_TESSERACT_PATH', u'/usr/bin/tesseract')
|
||||
TESSERACT_LANGUAGE = getattr(settings, 'OCR_TESSERACT_LANGUAGE', None)
|
||||
MAX_CONCURRENT_EXECUTION = getattr(settings, 'OCR_MAX_CONCURRENT_EXECUTION', 2)
|
||||
REPLICATION_DELAY = getattr(settings, 'OCR_REPLICATION_DELAY', 10) #In seconds
|
||||
NODE_CONCURRENT_EXECUTION = getattr(settings, 'OCR_NODE_CONCURRENT_EXECUTION', 1)
|
||||
|
||||
@@ -51,6 +51,7 @@ class QueueDocument(models.Model):
|
||||
default=QUEUEDOCUMENT_STATE_PENDING,
|
||||
verbose_name=_(u'state'))
|
||||
result = models.TextField(blank=True, null=True, verbose_name=_(u'result'))
|
||||
node_name = models.CharField(max_length=32, verbose_name=_(u'node name'), blank=True, null=True)
|
||||
|
||||
class Meta:
|
||||
ordering = ('datetime_submitted',)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from datetime import date, timedelta, datetime
|
||||
import platform
|
||||
|
||||
from django.db.models import Q
|
||||
|
||||
@@ -12,7 +13,7 @@ from literals import QUEUEDOCUMENT_STATE_PENDING, \
|
||||
QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_ACTIVE, \
|
||||
QUEUEDOCUMENT_STATE_ERROR
|
||||
from models import QueueDocument, DocumentQueue
|
||||
from ocr.conf.settings import MAX_CONCURRENT_EXECUTION
|
||||
from ocr.conf.settings import NODE_CONCURRENT_EXECUTION
|
||||
from ocr.conf.settings import REPLICATION_DELAY
|
||||
|
||||
|
||||
@@ -20,6 +21,7 @@ from ocr.conf.settings import REPLICATION_DELAY
|
||||
def task_process_queue_document(queue_document_id):
|
||||
queue_document = QueueDocument.objects.get(id=queue_document_id)
|
||||
queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
|
||||
queue_document.node_name = platform.node()
|
||||
queue_document.save()
|
||||
try:
|
||||
do_document_ocr(queue_document.document)
|
||||
@@ -42,8 +44,9 @@ class DocumentQueueWatcher(PeriodicTask):
|
||||
q_delay_interval = Q(datetime_submitted__lt=datetime.now()-timedelta(seconds=REPLICATION_DELAY))
|
||||
for document_queue in DocumentQueue.objects.filter(state=DOCUMENTQUEUE_STATE_ACTIVE):
|
||||
logger.debug('Analysing queue: %s' % document_queue)
|
||||
current_running_queues = QueueDocument.objects.filter(state=QUEUEDOCUMENT_STATE_PROCESSING).count()
|
||||
if current_running_queues < MAX_CONCURRENT_EXECUTION:
|
||||
if QueueDocument.objects.filter(
|
||||
state=QUEUEDOCUMENT_STATE_PROCESSING).filter(
|
||||
node_name=platform.node()).count() < NODE_CONCURRENT_EXECUTION:
|
||||
try:
|
||||
oldest_queued_document_qs = document_queue.queuedocument_set.filter(
|
||||
(q_pending & ~q_delayed) | (q_pending & q_delayed & q_delay_interval))
|
||||
|
||||
@@ -51,6 +51,7 @@ def queue_document_list(request, queue_name='default'):
|
||||
{'name':'submitted', 'attribute': lambda x: unicode(x.datetime_submitted).split('.')[0], 'keep_together':True},
|
||||
{'name':'delay', 'attribute':'delay'},
|
||||
{'name':'state', 'attribute': lambda x: x.get_state_display()},
|
||||
{'name':'node', 'attribute':'node_name'},
|
||||
{'name':'result', 'attribute':'result'},
|
||||
],
|
||||
'multi_select_as_buttons':True,
|
||||
|
||||
@@ -243,7 +243,7 @@ LOGIN_EXEMPT_URLS = (
|
||||
|
||||
# OCR
|
||||
#OCR_TESSERACT_PATH = u'/usr/bin/tesseract'
|
||||
#OCR_MAX_CONCURRENT_EXECUTION = 2
|
||||
#OCR_NODE_CONCURRENT_EXECUTION = 1
|
||||
#OCR_TESSERACT_LANGUAGE = None
|
||||
#OCR_REPLICATION_DELAY = 10
|
||||
|
||||
|
||||
Reference in New Issue
Block a user