Added distributed OCR queue support
This commit is contained in:
@@ -32,6 +32,8 @@ Python:
|
||||
* Django - A high-level Python Web framework that encourages rapid development and clean, pragmatic design.
|
||||
* django-pagination
|
||||
* django-filetransfers - File upload/download abstraction
|
||||
* django-celery
|
||||
* celery
|
||||
|
||||
Or execute pip install -r requirements/production.txt to install the dependencies automatically.
|
||||
|
||||
@@ -42,6 +44,7 @@ Executables:
|
||||
* tesseract-ocr - An OCR Engine that was developed at HP Labs between 1985 and 1995... and now at Google.
|
||||
* unpaper - post-processing scanned and photocopied book pages
|
||||
|
||||
|
||||
License
|
||||
-------
|
||||
See docs/LICENSE file
|
||||
|
||||
@@ -76,6 +76,6 @@ register_model_list_columns(Document, [
|
||||
register_menu([
|
||||
{'text':_('documents'), 'view':'document_create', 'links':[
|
||||
document_create, document_create_multiple, document_list
|
||||
],'famfam':'page','position':4}])
|
||||
],'famfam':'page','position':1}])
|
||||
|
||||
TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp()
|
||||
|
||||
@@ -34,6 +34,7 @@ STAGING_DIRECTORY = getattr(settings, 'DOCUMENTS_STAGING_DIRECTORY', u'/tmp/maya
|
||||
DELETE_STAGING_FILE_AFTER_UPLOAD = getattr(settings, 'DOCUMENTS_DELETE_STAGING_FILE_AFTER_UPLOAD', False)
|
||||
STAGING_FILES_PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_STAGING_FILES_PREVIEW_SIZE', '640x480')
|
||||
DELETE_LOCAL_ORIGINAL = getattr(settings, 'DOCUMENTS_DELETE_LOCAL_ORIGINAL', False)
|
||||
AUTOMATIC_OCR = getattr(settings, 'DOCUMENTS_AUTOMATIC_OCR', False)
|
||||
|
||||
# Saving
|
||||
CHECKSUM_FUNCTION = getattr(settings, 'DOCUMENTS_CHECKSUM_FUNCTION', lambda x: hashlib.sha256(x).hexdigest())
|
||||
|
||||
@@ -30,6 +30,8 @@ from forms import DocumentTypeSelectForm, DocumentCreateWizard, \
|
||||
|
||||
from staging import StagingFile
|
||||
|
||||
from ocr.models import add_document_to_queue
|
||||
|
||||
from documents.conf.settings import DELETE_STAGING_FILE_AFTER_UPLOAD
|
||||
from documents.conf.settings import USE_STAGING_DIRECTORY
|
||||
from documents.conf.settings import FILESYSTEM_FILESERVING_ENABLE
|
||||
@@ -39,6 +41,7 @@ from documents.conf.settings import THUMBNAIL_SIZE
|
||||
from documents.conf.settings import GROUP_MAX_RESULTS
|
||||
from documents.conf.settings import GROUP_SHOW_EMPTY
|
||||
from documents.conf.settings import DEFAULT_TRANSFORMATIONS
|
||||
from documents.conf.settings import AUTOMATIC_OCR
|
||||
|
||||
|
||||
from documents import PERMISSION_DOCUMENT_CREATE, \
|
||||
@@ -130,6 +133,9 @@ def upload_document_with_type(request, document_type_id, multiple=True):
|
||||
instance.update_mimetype()
|
||||
instance.update_page_count()
|
||||
instance.apply_default_transformations()
|
||||
if AUTOMATIC_OCR:
|
||||
document_queue = add_document_to_queue(instance)
|
||||
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (instance, document_queue.label))
|
||||
|
||||
if 'document_type_available_filenames' in local_form.cleaned_data:
|
||||
if local_form.cleaned_data['document_type_available_filenames']:
|
||||
@@ -165,7 +171,9 @@ def upload_document_with_type(request, document_type_id, multiple=True):
|
||||
document.update_mimetype()
|
||||
document.update_page_count()
|
||||
document.apply_default_transformations()
|
||||
|
||||
if AUTOMATIC_OCR:
|
||||
document_queue = add_document_to_queue(instance)
|
||||
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (instance, document_queue.label))
|
||||
except Exception, e:
|
||||
messages.error(request, e)
|
||||
else:
|
||||
|
||||
@@ -2,5 +2,5 @@ from django.utils.translation import ugettext_lazy as _
|
||||
from common.api import register_menu
|
||||
|
||||
register_menu([
|
||||
{'text':_(u'search'), 'view':'search', 'famfam':'zoom', 'position':5},
|
||||
{'text':_(u'search'), 'view':'search', 'famfam':'zoom', 'position':2},
|
||||
])
|
||||
|
||||
@@ -35,6 +35,7 @@ def check_settings(request):
|
||||
{'name':'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', 'value':documents_settings.FILESYSTEM_FILESERVING_PATH, 'exists':True},
|
||||
{'name':'DOCUMENTS_SLUGIFY_PATHS', 'value':documents_settings.FILESYSTEM_SLUGIFY_PATHS},
|
||||
{'name':'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 'value':documents_settings.FILESYSTEM_MAX_RENAME_COUNT},
|
||||
{'name':'DOCUMENTS_AUTOMATIC_OCR', 'value':documents_settings.AUTOMATIC_OCR},
|
||||
|
||||
#Common
|
||||
{'name':'COMMON_TEMPORARY_DIRECTORY', 'value':common_settings.TEMPORARY_DIRECTORY, 'exists':True},
|
||||
@@ -45,6 +46,7 @@ def check_settings(request):
|
||||
|
||||
#ocr
|
||||
{'name':'OCR_TESSERACT_PATH', 'value':ocr_settings.TESSERACT_PATH, 'exists':True},
|
||||
{'name':'OCR_MAX_CONCURRENT_EXECUTION', 'value':ocr_settings.MAX_CONCURRENT_EXECUTION},
|
||||
]
|
||||
|
||||
context={
|
||||
|
||||
@@ -25,11 +25,10 @@ submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'a
|
||||
register_links(Document, [submit_document], menu_name='sidebar')
|
||||
|
||||
#Menus
|
||||
#register_menu([
|
||||
# {'text':_('OCR'), 'view':'ocr_queue', 'links':[
|
||||
# ocr_queue
|
||||
# ],'famfam':'hourglass','position':5}])
|
||||
|
||||
register_menu([
|
||||
{'text':_('OCR'), 'view':'queue_document_list', 'links':[
|
||||
#ocr_queue
|
||||
],'famfam':'hourglass','position':4}])
|
||||
|
||||
try:
|
||||
default_queue, created = DocumentQueue.objects.get_or_create(name='default')
|
||||
|
||||
@@ -15,11 +15,6 @@ from converter.api import convert_document_for_ocr
|
||||
|
||||
from ocr.conf.settings import TESSERACT_PATH
|
||||
|
||||
#from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
|
||||
# QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING
|
||||
|
||||
#from models import DocumentQueue
|
||||
|
||||
|
||||
def cleanup(filename):
|
||||
''' tries to remove the given filename. Ignores non-existent files '''
|
||||
|
||||
@@ -6,6 +6,13 @@ from documents.models import Document
|
||||
from literals import DOCUMENTQUEUE_STATE_STOPPED,\
|
||||
DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING,\
|
||||
QUEUEDOCUMENT_STATE_CHOICES
|
||||
|
||||
|
||||
def add_document_to_queue(document, queue_name='default'):
    """
    Register ``document`` on the OCR queue called ``queue_name``.

    The queue is looked up by its ``name`` field, a new QueueDocument
    linking that queue and the document is saved, and the DocumentQueue
    instance is handed back to the caller.
    """
    target_queue = DocumentQueue.objects.get(name=queue_name)
    QueueDocument(document_queue=target_queue, document=document).save()
    return target_queue
|
||||
|
||||
|
||||
class DocumentQueue(models.Model):
|
||||
@@ -23,10 +30,9 @@ class DocumentQueue(models.Model):
|
||||
def __unicode__(self):
|
||||
return self.label
|
||||
|
||||
# def add_document(self, document):
|
||||
# queue_document = QueueDocument(document_queue=self, document=document)
|
||||
# queue_document.save()
|
||||
# queue_dict[self.name].put(queue_document)
|
||||
def add_document(self, document):
    """Save a new QueueDocument that ties ``document`` to this queue."""
    QueueDocument(document_queue=self, document=document).save()
|
||||
|
||||
|
||||
class QueueDocument(models.Model):
|
||||
@@ -38,7 +44,6 @@ class QueueDocument(models.Model):
|
||||
default=QUEUEDOCUMENT_STATE_PENDING,
|
||||
verbose_name=_(u'state'))
|
||||
result = models.TextField(blank=True, null=True, verbose_name=_(u'result'))
|
||||
pid = models.PositiveIntegerField(blank=True, null=True, verbose_name=_(u'process id'))
|
||||
|
||||
class Meta:
|
||||
verbose_name = _(u'queue document')
|
||||
|
||||
@@ -3,4 +3,5 @@ from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
urlpatterns = patterns('ocr.views',
|
||||
url(r'^(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
|
||||
url(r'^ocr/queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
|
||||
)
|
||||
|
||||
@@ -13,10 +13,36 @@ from documents.models import Document
|
||||
|
||||
from ocr import PERMISSION_OCR_DOCUMENT
|
||||
|
||||
from models import DocumentQueue, QueueDocument
|
||||
from models import DocumentQueue, QueueDocument, add_document_to_queue
|
||||
|
||||
from tasks import do_document_ocr_task
|
||||
|
||||
|
||||
def queue_document_list(request, queue_name='default'):
|
||||
permissions = [PERMISSION_OCR_DOCUMENT]
|
||||
try:
|
||||
check_permissions(request.user, 'ocr', permissions)
|
||||
except Unauthorized, e:
|
||||
raise Http404(e)
|
||||
|
||||
document_queue = get_object_or_404(DocumentQueue, name=queue_name)
|
||||
|
||||
return object_list(
|
||||
request,
|
||||
queryset=document_queue.queuedocument_set.all(),
|
||||
template_name='generic_list.html',
|
||||
extra_context={
|
||||
'title':_(u'queued documents'),
|
||||
'hide_object':True,
|
||||
'extra_columns':[
|
||||
{'name':'document', 'attribute': 'document'},
|
||||
{'name':'submitted', 'attribute': lambda x: unicode(x.datetime_submitted).split('.')[0]},
|
||||
{'name':'state', 'attribute': lambda x: x.get_state_display()},
|
||||
],
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
def submit_document(request, document_id, queue_name='default'):
|
||||
permissions = [PERMISSION_OCR_DOCUMENT]
|
||||
try:
|
||||
@@ -27,13 +53,7 @@ def submit_document(request, document_id, queue_name='default'):
|
||||
document = get_object_or_404(Document, pk=document_id)
|
||||
|
||||
document_queue = get_object_or_404(DocumentQueue, name=queue_name)
|
||||
do_document_ocr_task.delay(document.id)
|
||||
##document_queue.add_document(document)
|
||||
#queue_document = QueueDocument(document_queue=document_queue, document=document)
|
||||
#queue_document.save()
|
||||
|
||||
|
||||
#add.delay(1,2)
|
||||
add_document_to_queue(document, document_queue.name)
|
||||
|
||||
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label))
|
||||
return HttpResponseRedirect(request.META['HTTP_REFERER'])
|
||||
|
||||
@@ -75,3 +75,7 @@
|
||||
* Don't do OCR on word processing or spreadsheet documents; strip tags and store the text instead
|
||||
* Storage backend to storage backend copy support, to move/migrate document to new storage backend
|
||||
* Tesseract default option ocr setup
|
||||
* Do separate default transformations for staging and for local uploads
|
||||
* Multiple ocr queue support
|
||||
* Enable/disable ocr queue view & links
|
||||
* Restrict view permission free form rename
|
||||
|
||||
@@ -179,6 +179,8 @@ LOGIN_EXEMPT_URLS = (
|
||||
#DOCUMENTS_STAGING_DIRECTORY = u'/tmp/mayan/staging'
|
||||
#DOCUMENTS_DELETE_STAGING_FILE_AFTER_UPLOAD = False
|
||||
#DOCUMENTS_STAGING_FILES_PREVIEW_SIZE = '640x480'
|
||||
#DOCUMENTS_AUTOMATIC_OCR = False
|
||||
|
||||
|
||||
# Saving
|
||||
#DOCUMENTS_CHECKSUM_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
|
||||
|
||||
Reference in New Issue
Block a user