Made the concurrent OCR code more granular: the limit is now per node, so every node can handle a different number of concurrent OCR tasks.
This commit is contained in:
@@ -78,9 +78,9 @@ def check_settings(request):
|
|||||||
# OCR
|
# OCR
|
||||||
{'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True},
|
{'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True},
|
||||||
{'name':'OCR_TESSERACT_LANGUAGE', 'value':ocr_settings.TESSERACT_LANGUAGE},
|
{'name':'OCR_TESSERACT_LANGUAGE', 'value':ocr_settings.TESSERACT_LANGUAGE},
|
||||||
{'name':'OCR_MAX_CONCURRENT_EXECUTION', 'value':ocr_settings.MAX_CONCURRENT_EXECUTION},
|
{'name':'OCR_NODE_CONCURRENT_EXECUTION', 'value':ocr_settings.NODE_CONCURRENT_EXECUTION},
|
||||||
|
{'name':'OCR_REPLICATION_DELAY', 'value':ocr_settings.REPLICATION_DELAY},
|
||||||
|
|
||||||
# Search
|
# Search
|
||||||
{'name':'SEARCH_LIMIT', 'value':search_settings.LIMIT},
|
{'name':'SEARCH_LIMIT', 'value':search_settings.LIMIT},
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -2,5 +2,5 @@ from django.conf import settings
|
|||||||
|
|
||||||
TESSERACT_PATH = getattr(settings, 'OCR_TESSERACT_PATH', u'/usr/bin/tesseract')
|
TESSERACT_PATH = getattr(settings, 'OCR_TESSERACT_PATH', u'/usr/bin/tesseract')
|
||||||
TESSERACT_LANGUAGE = getattr(settings, 'OCR_TESSERACT_LANGUAGE', None)
|
TESSERACT_LANGUAGE = getattr(settings, 'OCR_TESSERACT_LANGUAGE', None)
|
||||||
MAX_CONCURRENT_EXECUTION = getattr(settings, 'OCR_MAX_CONCURRENT_EXECUTION', 2)
|
|
||||||
REPLICATION_DELAY = getattr(settings, 'OCR_REPLICATION_DELAY', 10) #In seconds
|
REPLICATION_DELAY = getattr(settings, 'OCR_REPLICATION_DELAY', 10) #In seconds
|
||||||
|
NODE_CONCURRENT_EXECUTION = getattr(settings, 'OCR_NODE_CONCURRENT_EXECUTION', 1)
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ class QueueDocument(models.Model):
|
|||||||
default=QUEUEDOCUMENT_STATE_PENDING,
|
default=QUEUEDOCUMENT_STATE_PENDING,
|
||||||
verbose_name=_(u'state'))
|
verbose_name=_(u'state'))
|
||||||
result = models.TextField(blank=True, null=True, verbose_name=_(u'result'))
|
result = models.TextField(blank=True, null=True, verbose_name=_(u'result'))
|
||||||
|
node_name = models.CharField(max_length=32, verbose_name=_(u'node name'), blank=True, null=True)
|
||||||
|
|
||||||
class Meta:
|
class Meta:
|
||||||
ordering = ('datetime_submitted',)
|
ordering = ('datetime_submitted',)
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
from datetime import date, timedelta, datetime
|
from datetime import date, timedelta, datetime
|
||||||
|
import platform
|
||||||
|
|
||||||
from django.db.models import Q
|
from django.db.models import Q
|
||||||
|
|
||||||
@@ -12,7 +13,7 @@ from literals import QUEUEDOCUMENT_STATE_PENDING, \
|
|||||||
QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_ACTIVE, \
|
QUEUEDOCUMENT_STATE_PROCESSING, DOCUMENTQUEUE_STATE_ACTIVE, \
|
||||||
QUEUEDOCUMENT_STATE_ERROR
|
QUEUEDOCUMENT_STATE_ERROR
|
||||||
from models import QueueDocument, DocumentQueue
|
from models import QueueDocument, DocumentQueue
|
||||||
from ocr.conf.settings import MAX_CONCURRENT_EXECUTION
|
from ocr.conf.settings import NODE_CONCURRENT_EXECUTION
|
||||||
from ocr.conf.settings import REPLICATION_DELAY
|
from ocr.conf.settings import REPLICATION_DELAY
|
||||||
|
|
||||||
|
|
||||||
@@ -20,6 +21,7 @@ from ocr.conf.settings import REPLICATION_DELAY
|
|||||||
def task_process_queue_document(queue_document_id):
|
def task_process_queue_document(queue_document_id):
|
||||||
queue_document = QueueDocument.objects.get(id=queue_document_id)
|
queue_document = QueueDocument.objects.get(id=queue_document_id)
|
||||||
queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
|
queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
|
||||||
|
queue_document.node_name = platform.node()
|
||||||
queue_document.save()
|
queue_document.save()
|
||||||
try:
|
try:
|
||||||
do_document_ocr(queue_document.document)
|
do_document_ocr(queue_document.document)
|
||||||
@@ -42,8 +44,9 @@ class DocumentQueueWatcher(PeriodicTask):
|
|||||||
q_delay_interval = Q(datetime_submitted__lt=datetime.now()-timedelta(seconds=REPLICATION_DELAY))
|
q_delay_interval = Q(datetime_submitted__lt=datetime.now()-timedelta(seconds=REPLICATION_DELAY))
|
||||||
for document_queue in DocumentQueue.objects.filter(state=DOCUMENTQUEUE_STATE_ACTIVE):
|
for document_queue in DocumentQueue.objects.filter(state=DOCUMENTQUEUE_STATE_ACTIVE):
|
||||||
logger.debug('Analysing queue: %s' % document_queue)
|
logger.debug('Analysing queue: %s' % document_queue)
|
||||||
current_running_queues = QueueDocument.objects.filter(state=QUEUEDOCUMENT_STATE_PROCESSING).count()
|
if QueueDocument.objects.filter(
|
||||||
if current_running_queues < MAX_CONCURRENT_EXECUTION:
|
state=QUEUEDOCUMENT_STATE_PROCESSING).filter(
|
||||||
|
node_name=platform.node()).count() < NODE_CONCURRENT_EXECUTION:
|
||||||
try:
|
try:
|
||||||
oldest_queued_document_qs = document_queue.queuedocument_set.filter(
|
oldest_queued_document_qs = document_queue.queuedocument_set.filter(
|
||||||
(q_pending & ~q_delayed) | (q_pending & q_delayed & q_delay_interval))
|
(q_pending & ~q_delayed) | (q_pending & q_delayed & q_delay_interval))
|
||||||
|
|||||||
@@ -51,6 +51,7 @@ def queue_document_list(request, queue_name='default'):
|
|||||||
{'name':'submitted', 'attribute': lambda x: unicode(x.datetime_submitted).split('.')[0], 'keep_together':True},
|
{'name':'submitted', 'attribute': lambda x: unicode(x.datetime_submitted).split('.')[0], 'keep_together':True},
|
||||||
{'name':'delay', 'attribute':'delay'},
|
{'name':'delay', 'attribute':'delay'},
|
||||||
{'name':'state', 'attribute': lambda x: x.get_state_display()},
|
{'name':'state', 'attribute': lambda x: x.get_state_display()},
|
||||||
|
{'name':'node', 'attribute':'node_name'},
|
||||||
{'name':'result', 'attribute':'result'},
|
{'name':'result', 'attribute':'result'},
|
||||||
],
|
],
|
||||||
'multi_select_as_buttons':True,
|
'multi_select_as_buttons':True,
|
||||||
|
|||||||
@@ -243,7 +243,7 @@ LOGIN_EXEMPT_URLS = (
|
|||||||
|
|
||||||
# OCR
|
# OCR
|
||||||
#OCR_TESSERACT_PATH = u'/usr/bin/tesseract'
|
#OCR_TESSERACT_PATH = u'/usr/bin/tesseract'
|
||||||
#OCR_MAX_CONCURRENT_EXECUTION = 2
|
#OCR_NODE_CONCURRENT_EXECUTION = 1
|
||||||
#OCR_TESSERACT_LANGUAGE = None
|
#OCR_TESSERACT_LANGUAGE = None
|
||||||
#OCR_REPLICATION_DELAY = 10
|
#OCR_REPLICATION_DELAY = 10
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user