Added distributed OCR queue support

This commit is contained in:
Roberto Rosario
2011-02-17 04:37:35 -04:00
parent 478fb3502e
commit 77b8a432a2
13 changed files with 66 additions and 26 deletions

View File

@@ -32,6 +32,8 @@ Python:
* Django - A high-level Python Web framework that encourages rapid development and clean, pragmatic design.
* django-pagination
* django-filetransfers - File upload/download abstraction
* django-celery
* celery
Or execute pip install -r requirements/production.txt to install the dependencies automatically.
@@ -42,6 +44,7 @@ Executables:
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
* unpaper - post-processing scanned and photocopied book pages
License
-------
See docs/LICENSE file

View File

@@ -76,6 +76,6 @@ register_model_list_columns(Document, [
register_menu([
{'text':_('documents'), 'view':'document_create', 'links':[
document_create, document_create_multiple, document_list
],'famfam':'page','position':4}])
],'famfam':'page','position':1}])
TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp()

View File

@@ -34,6 +34,7 @@ STAGING_DIRECTORY = getattr(settings, 'DOCUMENTS_STAGING_DIRECTORY', u'/tmp/maya
DELETE_STAGING_FILE_AFTER_UPLOAD = getattr(settings, 'DOCUMENTS_DELETE_STAGING_FILE_AFTER_UPLOAD', False)
STAGING_FILES_PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_STAGING_FILES_PREVIEW_SIZE', '640x480')
DELETE_LOCAL_ORIGINAL = getattr(settings, 'DOCUMENTS_DELETE_LOCAL_ORIGINAL', False)
AUTOMATIC_OCR = getattr(settings, 'DOCUMENTS_AUTOMATIC_OCR', False)
# Saving
CHECKSUM_FUNCTION = getattr(settings, 'DOCUMENTS_CHECKSUM_FUNCTION', lambda x: hashlib.sha256(x).hexdigest())

View File

@@ -30,6 +30,8 @@ from forms import DocumentTypeSelectForm, DocumentCreateWizard, \
from staging import StagingFile
from ocr.models import add_document_to_queue
from documents.conf.settings import DELETE_STAGING_FILE_AFTER_UPLOAD
from documents.conf.settings import USE_STAGING_DIRECTORY
from documents.conf.settings import FILESYSTEM_FILESERVING_ENABLE
@@ -39,6 +41,7 @@ from documents.conf.settings import THUMBNAIL_SIZE
from documents.conf.settings import GROUP_MAX_RESULTS
from documents.conf.settings import GROUP_SHOW_EMPTY
from documents.conf.settings import DEFAULT_TRANSFORMATIONS
from documents.conf.settings import AUTOMATIC_OCR
from documents import PERMISSION_DOCUMENT_CREATE, \
@@ -130,6 +133,9 @@ def upload_document_with_type(request, document_type_id, multiple=True):
instance.update_mimetype()
instance.update_page_count()
instance.apply_default_transformations()
if AUTOMATIC_OCR:
document_queue = add_document_to_queue(instance)
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (instance, document_queue.label))
if 'document_type_available_filenames' in local_form.cleaned_data:
if local_form.cleaned_data['document_type_available_filenames']:
@@ -165,7 +171,9 @@ def upload_document_with_type(request, document_type_id, multiple=True):
document.update_mimetype()
document.update_page_count()
document.apply_default_transformations()
if AUTOMATIC_OCR:
document_queue = add_document_to_queue(instance)
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (instance, document_queue.label))
except Exception, e:
messages.error(request, e)
else:

View File

@@ -2,5 +2,5 @@ from django.utils.translation import ugettext_lazy as _
from common.api import register_menu
register_menu([
{'text':_(u'search'), 'view':'search', 'famfam':'zoom', 'position':5},
{'text':_(u'search'), 'view':'search', 'famfam':'zoom', 'position':2},
])

View File

@@ -35,6 +35,7 @@ def check_settings(request):
{'name':'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', 'value':documents_settings.FILESYSTEM_FILESERVING_PATH, 'exists':True},
{'name':'DOCUMENTS_SLUGIFY_PATHS', 'value':documents_settings.FILESYSTEM_SLUGIFY_PATHS},
{'name':'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 'value':documents_settings.FILESYSTEM_MAX_RENAME_COUNT},
{'name':'DOCUMENTS_AUTOMATIC_OCR', 'value':documents_settings.AUTOMATIC_OCR},
#Common
{'name':'COMMON_TEMPORARY_DIRECTORY', 'value':common_settings.TEMPORARY_DIRECTORY, 'exists':True},
@@ -45,6 +46,7 @@ def check_settings(request):
#ocr
{'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True},
{'name':'OCR_MAX_CONCURRENT_EXECUTION', 'value':ocr_settings.MAX_CONCURRENT_EXECUTION},
]
context={

View File

@@ -25,11 +25,10 @@ submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'a
register_links(Document, [submit_document], menu_name='sidebar')
#Menus
#register_menu([
# {'text':_('OCR'), 'view':'ocr_queue', 'links':[
# ocr_queue
# ],'famfam':'hourglass','position':5}])
register_menu([
{'text':_('OCR'), 'view':'queue_document_list', 'links':[
#ocr_queue
],'famfam':'hourglass','position':4}])
try:
default_queue, created = DocumentQueue.objects.get_or_create(name='default')

View File

@@ -15,11 +15,6 @@ from converter.api import convert_document_for_ocr
from ocr.conf.settings import TESSERACT_PATH
#from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
# QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING
#from models import DocumentQueue
def cleanup(filename):
''' tries to remove the given filename. Ignores non-existent files '''

View File

@@ -6,6 +6,13 @@ from documents.models import Document
from literals import DOCUMENTQUEUE_STATE_STOPPED,\
DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING,\
QUEUEDOCUMENT_STATE_CHOICES
def add_document_to_queue(document, queue_name='default'):
    """
    Put ``document`` on the OCR queue named ``queue_name``.

    The queue is looked up by name (it is not created here), a new
    QueueDocument linking the queue and the document is saved, and the
    DocumentQueue itself -- not the new queue entry -- is returned so
    callers can report the queue's label.
    """
    # NOTE(review): objects.get presumably raises DoesNotExist for an
    # unknown queue name -- confirm callers expect that.
    target_queue = DocumentQueue.objects.get(name=queue_name)
    QueueDocument(document_queue=target_queue, document=document).save()
    return target_queue
class DocumentQueue(models.Model):
@@ -23,10 +30,9 @@ class DocumentQueue(models.Model):
def __unicode__(self):
return self.label
# def add_document(self, document):
# queue_document = QueueDocument(document_queue=self, document=document)
# queue_document.save()
# queue_dict[self.name].put(queue_document)
def add_document(self, document):
    """Save a new QueueDocument that links ``document`` to this queue."""
    QueueDocument(document_queue=self, document=document).save()
class QueueDocument(models.Model):
@@ -38,7 +44,6 @@ class QueueDocument(models.Model):
default=QUEUEDOCUMENT_STATE_PENDING,
verbose_name=_(u'state'))
result = models.TextField(blank=True, null=True, verbose_name=_(u'result'))
pid = models.PositiveIntegerField(blank=True, null=True, verbose_name=_(u'process id'))
class Meta:
verbose_name = _(u'queue document')

View File

@@ -3,4 +3,5 @@ from django.utils.translation import ugettext_lazy as _
urlpatterns = patterns('ocr.views',
url(r'^(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
url(r'^ocr/queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
)

View File

@@ -13,10 +13,36 @@ from documents.models import Document
from ocr import PERMISSION_OCR_DOCUMENT
from models import DocumentQueue, QueueDocument
from models import DocumentQueue, QueueDocument, add_document_to_queue
from tasks import do_document_ocr_task
def queue_document_list(request, queue_name='default'):
    """
    Render the list of documents held by the OCR queue ``queue_name``.

    The user must pass the PERMISSION_OCR_DOCUMENT check; a failed check
    (Unauthorized) is reported as a 404, as is an unknown queue name.
    The listing is delegated to the generic ``object_list`` view using
    the ``generic_list.html`` template.
    """
    try:
        check_permissions(request.user, 'ocr', [PERMISSION_OCR_DOCUMENT])
    except Unauthorized as e:
        raise Http404(e)

    queue = get_object_or_404(DocumentQueue, name=queue_name)

    def submitted_text(entry):
        # Keep only the part of the timestamp's text form before the
        # first '.'.
        return unicode(entry.datetime_submitted).split('.')[0]

    def state_text(entry):
        return entry.get_state_display()

    columns = [
        {'name':'document', 'attribute': 'document'},
        {'name':'submitted', 'attribute': submitted_text},
        {'name':'state', 'attribute': state_text},
    ]
    context = {
        'title':_(u'queued documents'),
        'hide_object':True,
        'extra_columns':columns,
    }

    return object_list(
        request,
        queryset=queue.queuedocument_set.all(),
        template_name='generic_list.html',
        extra_context=context,
    )
def submit_document(request, document_id, queue_name='default'):
permissions = [PERMISSION_OCR_DOCUMENT]
try:
@@ -27,13 +53,7 @@ def submit_document(request, document_id, queue_name='default'):
document = get_object_or_404(Document, pk=document_id)
document_queue = get_object_or_404(DocumentQueue, name=queue_name)
do_document_ocr_task.delay(document.id)
##document_queue.add_document(document)
#queue_document = QueueDocument(document_queue=document_queue, document=document)
#queue_document.save()
#add.delay(1,2)
add_document_to_queue(document, document_queue.name)
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label))
return HttpResponseRedirect(request.META['HTTP_REFERER'])

View File

@@ -75,3 +75,7 @@
* Don't do OCR on word processing or spreadsheet documents; strip tags and store text
* Storage backend to storage backend copy support, to move/migrate document to new storage backend
* Tesseract default option OCR setup
* Do separate default transformations for staging and for local uploads
* Multiple ocr queue support
* Enable/disable ocr queue view & links
* Restrict view permission free form rename

View File

@@ -179,6 +179,8 @@ LOGIN_EXEMPT_URLS = (
#DOCUMENTS_STAGING_DIRECTORY = u'/tmp/mayan/staging'
#DOCUMENTS_DELETE_STAGING_FILE_AFTER_UPLOAD = False
#DOCUMENTS_STAGING_FILES_PREVIEW_SIZE = '640x480'
#DOCUMENTS_AUTOMATIC_OCR = False
# Saving
#DOCUMENTS_CHECKSUM_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()