First commit to support ocr subprocess
This commit is contained in:
@@ -72,10 +72,10 @@ class ImageWidget(forms.widgets.Widget):
|
|||||||
|
|
||||||
output.append('<br /><span class="famfam active famfam-magnifier"></span>%s' % ugettext(u'Click on the image for full size view'))
|
output.append('<br /><span class="famfam active famfam-magnifier"></span>%s' % ugettext(u'Click on the image for full size view'))
|
||||||
|
|
||||||
#for document_page in value.documentpage_set.all():
|
for document_page in value.documentpage_set.all():
|
||||||
# output.append('<br/>%s)<a href="%s">%s</a>' % (document_page.page_number,
|
output.append('<br/>%s)<a href="%s">%s</a>' % (document_page.page_number,
|
||||||
# reverse('document_page_view', args=[document_page.id]),
|
reverse('document_page_view', args=[document_page.id]),
|
||||||
# ugettext(u'page view')))
|
ugettext(u'page view')))
|
||||||
#output.append(super(ImageWidget, self).render(name, value, attrs))
|
#output.append(super(ImageWidget, self).render(name, value, attrs))
|
||||||
return mark_safe(u''.join(output))
|
return mark_safe(u''.join(output))
|
||||||
|
|
||||||
|
|||||||
@@ -1,21 +1,50 @@
|
|||||||
|
from multiprocessing import Queue
|
||||||
|
|
||||||
from django.utils.translation import ugettext_lazy as _
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
from django.utils.translation import ugettext
|
||||||
|
from django.db.utils import DatabaseError
|
||||||
|
|
||||||
from common.api import register_links, register_menu
|
from common.api import register_links, register_menu
|
||||||
from permissions.api import register_permissions
|
from permissions.api import register_permissions
|
||||||
|
|
||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
|
||||||
PERMISSION_OCR_DOCUMENT = 'ocr_document'
|
from models import DocumentQueue
|
||||||
|
from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
|
||||||
|
DOCUMENTQUEUE_STATE_STOPPED, QUEUEDOCUMENT_STATE_PENDING
|
||||||
|
|
||||||
|
from api import start_queue_watcher
|
||||||
|
|
||||||
|
#Permissions
|
||||||
|
PERMISSION_OCR_DOCUMENT = 'ocr_document'
|
||||||
register_permissions('ocr', [
|
register_permissions('ocr', [
|
||||||
{'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
|
{'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
|
||||||
])
|
])
|
||||||
|
|
||||||
|
#Links
|
||||||
submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'args':'object.id', 'famfam':'page_lightning', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_DOCUMENT]}}
|
submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'args':'object.id', 'famfam':'page_lightning', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_DOCUMENT]}}
|
||||||
|
|
||||||
register_links(Document, [submit_document], menu_name='sidebar')
|
register_links(Document, [submit_document], menu_name='sidebar')
|
||||||
|
|
||||||
|
#Menus
|
||||||
#register_menu([
|
#register_menu([
|
||||||
# {'text':_('OCR'), 'view':'ocr_queue', 'links':[
|
# {'text':_('OCR'), 'view':'ocr_queue', 'links':[
|
||||||
# ocr_queue
|
# ocr_queue
|
||||||
# ],'famfam':'hourglass','position':5}])
|
# ],'famfam':'hourglass','position':5}])
|
||||||
|
|
||||||
|
|
||||||
|
# Bootstrap the OCR queues when the app is imported: make sure the 'default'
# queue exists, mark every queue as stopped, start a watcher for each one and
# put documents left in the 'processing' state back to 'pending'.
try:
    # 'defaults' sets the label in the same INSERT, so a freshly created
    # queue no longer needs a second save() just to receive its label.
    default_queue, created = DocumentQueue.objects.get_or_create(
        name='default', defaults={'label': ugettext(u'Default')})

    for queue in DocumentQueue.objects.all():
        queue.state = DOCUMENTQUEUE_STATE_STOPPED
        queue.save()
        start_queue_watcher(queue.name)
        # Entries still flagged as processing are reset so they are picked
        # up again by the watcher.
        for document in queue.queuedocument_set.filter(state=QUEUEDOCUMENT_STATE_PROCESSING):
            document.state = QUEUEDOCUMENT_STATE_PENDING
            document.save()
except DatabaseError:
    # Original note: "syncdb" -- presumably the tables do not exist yet when
    # this module is imported during syncdb; skip the bootstrap in that case.
    pass
|
||||||
|
|||||||
19
apps/ocr/admin.py
Normal file
19
apps/ocr/admin.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
from django.contrib import admin
|
||||||
|
|
||||||
|
from models import DocumentQueue, QueueDocument
|
||||||
|
|
||||||
|
|
||||||
|
class QueueDocumentInline(admin.StackedInline):
    """Stacked inline for editing QueueDocument rows inside their queue."""

    model = QueueDocument
    # NOTE(review): 'classes' and 'allow_add' look like options of an admin
    # skin rather than stock Django admin -- confirm which package reads them.
    classes = ('collapse-open',)
    allow_add = True
    # Number of extra blank inline forms offered.
    extra = 1
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentQueueAdmin(admin.ModelAdmin):
    """Admin for DocumentQueue, with its queued documents edited inline."""

    inlines = [QueueDocumentInline]
    # Columns of the change list. The option was previously spelled
    # 'list_directory', which ModelAdmin does not know and silently ignores.
    list_display = ('name', 'label', 'state')
|
||||||
|
|
||||||
|
|
||||||
|
admin.site.register(DocumentQueue, DocumentQueueAdmin)
|
||||||
|
|
||||||
@@ -1,18 +1,29 @@
|
|||||||
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
|
#Some code from http://wiki.github.com/hoffstaetter/python-tesseract
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
from multiprocessing import Process, Queue
|
||||||
|
from Queue import Empty
|
||||||
|
|
||||||
import subprocess
|
import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
|
|
||||||
from django.utils.translation import ugettext as _
|
from django.utils.translation import ugettext as _
|
||||||
|
from django.contrib import messages
|
||||||
|
|
||||||
from documents.models import DocumentPage
|
|
||||||
from common.conf.settings import TEMPORARY_DIRECTORY
|
from common.conf.settings import TEMPORARY_DIRECTORY
|
||||||
|
|
||||||
|
from documents.models import Document
|
||||||
|
|
||||||
from converter.api import convert_document_for_ocr
|
from converter.api import convert_document_for_ocr
|
||||||
|
|
||||||
from ocr.conf.settings import TESSERACT_PATH
|
from ocr.conf.settings import TESSERACT_PATH
|
||||||
|
|
||||||
|
from literals import QUEUEDOCUMENT_STATE_PROCESSING, \
|
||||||
|
QUEUEDOCUMENT_STATE_ERROR, QUEUEDOCUMENT_STATE_PENDING
|
||||||
|
|
||||||
|
from models import DocumentQueue
|
||||||
|
|
||||||
|
queue_dict = {}
|
||||||
|
|
||||||
def cleanup(filename):
|
def cleanup(filename):
|
||||||
''' tries to remove the given filename. Ignores non-existent files '''
|
''' tries to remove the given filename. Ignores non-existent files '''
|
||||||
@@ -46,7 +57,9 @@ def run_tesseract(input_filename, output_filename_base, lang=None):
|
|||||||
return (proc.wait(), proc.stderr.read())
|
return (proc.wait(), proc.stderr.read())
|
||||||
|
|
||||||
|
|
||||||
def ocr_document(document):
|
#def do_document_ocr(document):
|
||||||
|
|
||||||
|
def do_document_ocr(document):
|
||||||
for page_index, document_page in enumerate(document.documentpage_set.all()):
|
for page_index, document_page in enumerate(document.documentpage_set.all()):
|
||||||
imagefile = convert_document_for_ocr(document, page=page_index)
|
imagefile = convert_document_for_ocr(document, page=page_index)
|
||||||
desc, filepath = tempfile.mkstemp()
|
desc, filepath = tempfile.mkstemp()
|
||||||
@@ -60,8 +73,9 @@ def ocr_document(document):
|
|||||||
|
|
||||||
f = file(ocr_output)
|
f = file(ocr_output)
|
||||||
try:
|
try:
|
||||||
document_page, created = DocumentPage.objects.get_or_create(document=document,
|
#document_page, created = DocumentPage.objects.get_or_create(document=document,
|
||||||
page_number=page_index+1)
|
# page_number=page_index+1)
|
||||||
|
document_page = document.documentpage_set.get(page_number=page_index+1)
|
||||||
document_page.content = f.read().strip()
|
document_page.content = f.read().strip()
|
||||||
document_page.page_label = _(u'Text from OCR')
|
document_page.page_label = _(u'Text from OCR')
|
||||||
document_page.save()
|
document_page.save()
|
||||||
@@ -70,3 +84,63 @@ def ocr_document(document):
|
|||||||
cleanup(filepath)
|
cleanup(filepath)
|
||||||
cleanup(ocr_output)
|
cleanup(ocr_output)
|
||||||
cleanup(imagefile)
|
cleanup(imagefile)
|
||||||
|
|
||||||
|
|
||||||
|
def do_queue_document(queue_document):
    """Run OCR for one queue entry and record the outcome on it.

    The entry is flagged as processing and saved before the OCR starts.
    When the OCR finishes the entry is deleted; when anything raises, the
    entry is kept, flagged with the error state, and the error text is
    stored in its ``result`` field.
    """
    queue_document.state = QUEUEDOCUMENT_STATE_PROCESSING
    queue_document.save()

    try:
        do_document_ocr(queue_document.document)
        queue_document.delete()
    except Exception as e:
        # Worker boundary: keep the failure on the queue entry instead of
        # letting it vanish with the process.
        try:
            message = u'%s' % e
        except UnicodeError:
            # The message could not be turned into text; fall back to the
            # always-safe representation.
            message = repr(e)
        queue_document.state = QUEUEDOCUMENT_STATE_ERROR
        # Store text, not the exception object itself.
        queue_document.result = message
        queue_document.save()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def process_queue_document(queue_document):
    """Start the OCR of *queue_document* in a separate process.

    The work is handed to ``do_queue_document`` through a new
    ``multiprocessing.Process``. The function returns once the process has
    been started; it does not wait for the OCR to finish.
    """
    # The leftover debugging lookup of Document id=42 was removed: it raised
    # DoesNotExist whenever that row was missing, so nothing was dispatched.
    worker = Process(target=do_queue_document, args=(queue_document,))
    worker.start()
|
||||||
|
|
||||||
|
|
||||||
|
def start_queue_watcher(queue_name):
|
||||||
|
|
||||||
|
if queue_name in queue_dict:
|
||||||
|
print 'already started'
|
||||||
|
else:
|
||||||
|
queue_dict[queue_name] = Queue()
|
||||||
|
print 'start', queue_name
|
||||||
|
# if queue_name in queue_dict:
|
||||||
|
document_queue = DocumentQueue.objects.get(name=queue_name)
|
||||||
|
watcher = Process(target=queue_watcher, args=(document_queue,))
|
||||||
|
watcher.start()
|
||||||
|
# else:
|
||||||
|
# raise Exception('No such queue: %s' % queue_name)
|
||||||
|
|
||||||
|
import time
|
||||||
|
import sys
|
||||||
|
def queue_watcher(document_queue):
    """Poll *document_queue* forever and dispatch its oldest pending entry.

    Every 5 seconds the pending entries of the queue are ordered by
    ``datetime_submitted`` and the first one, if any, is passed to
    ``process_queue_document``. The loop never returns.
    """
    while True:
        time.sleep(5)
        try:
            oldest_queued_document = document_queue.queuedocument_set.filter(
                state=QUEUEDOCUMENT_STATE_PENDING).order_by('datetime_submitted')[0]
        except IndexError:
            # Nothing is pending; wait for the next cycle.
            continue
        except Exception as e:
            # Keep the watcher alive but report the failure. The former bare
            # 'except: pass' hid real errors and also swallowed
            # KeyboardInterrupt/SystemExit.
            sys.stderr.write('queue watcher error: %r\n' % (e,))
            continue

        try:
            process_queue_document(oldest_queued_document)
            print('queue.get %s' % oldest_queued_document)
            sys.stdout.flush()
        except Exception as e:
            sys.stderr.write('queue watcher error: %r\n' % (e,))
|
||||||
|
|||||||
21
apps/ocr/literals.py
Normal file
21
apps/ocr/literals.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
|
|
||||||
|
# State codes of a document queue, and the (code, label) choices built from them.
DOCUMENTQUEUE_STATE_STOPPED = 's'
DOCUMENTQUEUE_STATE_ACTIVE = 'a'

DOCUMENTQUEUE_STATE_CHOICES = (
    (DOCUMENTQUEUE_STATE_STOPPED, _(u'stopped')),
    (DOCUMENTQUEUE_STATE_ACTIVE, _(u'active')),
)


# State codes of a single queued document, and their (code, label) choices.
QUEUEDOCUMENT_STATE_PENDING = 'p'
QUEUEDOCUMENT_STATE_PROCESSING = 'i'
QUEUEDOCUMENT_STATE_ERROR = 'e'

QUEUEDOCUMENT_STATE_CHOICES = (
    (QUEUEDOCUMENT_STATE_PENDING, _(u'pending')),
    (QUEUEDOCUMENT_STATE_PROCESSING, _(u'processing')),
    (QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
)
|
||||||
@@ -1,3 +1,51 @@
|
|||||||
from django.db import models
|
from django.db import models
|
||||||
|
from django.utils.translation import ugettext_lazy as _
|
||||||
|
|
||||||
# Create your models here.
|
from documents.models import Document
|
||||||
|
|
||||||
|
from literals import DOCUMENTQUEUE_STATE_STOPPED,\
|
||||||
|
DOCUMENTQUEUE_STATE_CHOICES, QUEUEDOCUMENT_STATE_PENDING,\
|
||||||
|
QUEUEDOCUMENT_STATE_CHOICES
|
||||||
|
|
||||||
|
|
||||||
|
#from api import queue_dict
|
||||||
|
|
||||||
|
|
||||||
|
class DocumentQueue(models.Model):
    """A named queue of documents; displayed by its label."""

    # Unique identifier of the queue.
    name = models.CharField(
        max_length=64,
        unique=True,
        verbose_name=_(u'name'),
    )
    # Text returned by __unicode__.
    label = models.CharField(
        max_length=64,
        verbose_name=_(u'label'),
    )
    # One of DOCUMENTQUEUE_STATE_CHOICES; new queues start as stopped.
    state = models.CharField(
        max_length=4,
        choices=DOCUMENTQUEUE_STATE_CHOICES,
        default=DOCUMENTQUEUE_STATE_STOPPED,
        verbose_name=_(u'state'),
    )

    class Meta:
        verbose_name = _(u'document queue')
        verbose_name_plural = _(u'document queues')

    def __unicode__(self):
        return self.label
|
||||||
|
|
||||||
|
# def add_document(self, document):
|
||||||
|
# queue_document = QueueDocument(document_queue=self, document=document)
|
||||||
|
# queue_document.save()
|
||||||
|
# queue_dict[self.name].put(queue_document)
|
||||||
|
|
||||||
|
|
||||||
|
class QueueDocument(models.Model):
    """One document placed in a DocumentQueue, with its processing state."""

    document_queue = models.ForeignKey(
        DocumentQueue,
        verbose_name=_(u'document queue'),
    )
    document = models.ForeignKey(
        Document,
        verbose_name=_(u'document'),
    )
    # Filled in automatically when the row is first created.
    datetime_submitted = models.DateTimeField(
        verbose_name=_(u'date time submitted'),
        auto_now_add=True,
    )
    # One of QUEUEDOCUMENT_STATE_CHOICES; new entries start as pending.
    state = models.CharField(
        max_length=4,
        choices=QUEUEDOCUMENT_STATE_CHOICES,
        default=QUEUEDOCUMENT_STATE_PENDING,
        verbose_name=_(u'state'),
    )
    result = models.TextField(
        blank=True,
        null=True,
        verbose_name=_(u'result'),
    )
    pid = models.PositiveIntegerField(
        blank=True,
        null=True,
        verbose_name=_(u'process id'),
    )

    class Meta:
        verbose_name = _(u'queue document')
        verbose_name_plural = _(u'queue documents')

    def __unicode__(self):
        return unicode(self.document)
|
||||||
|
|||||||
@@ -12,9 +12,11 @@ from permissions.api import check_permissions, Unauthorized
|
|||||||
from documents.models import Document
|
from documents.models import Document
|
||||||
|
|
||||||
from ocr import PERMISSION_OCR_DOCUMENT
|
from ocr import PERMISSION_OCR_DOCUMENT
|
||||||
from api import ocr_document
|
|
||||||
|
|
||||||
def submit_document(request, document_id):
|
from models import DocumentQueue, QueueDocument
|
||||||
|
|
||||||
|
|
||||||
|
def submit_document(request, document_id, queue_name='default'):
|
||||||
permissions = [PERMISSION_OCR_DOCUMENT]
|
permissions = [PERMISSION_OCR_DOCUMENT]
|
||||||
try:
|
try:
|
||||||
check_permissions(request.user, 'ocr', permissions)
|
check_permissions(request.user, 'ocr', permissions)
|
||||||
@@ -23,11 +25,10 @@ def submit_document(request, document_id):
|
|||||||
|
|
||||||
document = get_object_or_404(Document, pk=document_id)
|
document = get_object_or_404(Document, pk=document_id)
|
||||||
|
|
||||||
try:
|
document_queue = get_object_or_404(DocumentQueue, name=queue_name)
|
||||||
result = ocr_document(document)
|
#document_queue.add_document(document)
|
||||||
except Exception, e:
|
queue_document = QueueDocument(document_queue=document_queue, document=document)
|
||||||
messages.error(request, e)
|
queue_document.save()
|
||||||
return HttpResponseRedirect(request.META['HTTP_REFERER'])
|
|
||||||
|
|
||||||
messages.success(request, _(u'Document OCR was successful.'))
|
messages.success(request, _(u'Document: %s was added to the OCR queue: %s.') % (document, document_queue.label))
|
||||||
return HttpResponseRedirect(request.META['HTTP_REFERER'])
|
return HttpResponseRedirect(request.META['HTTP_REFERER'])
|
||||||
|
|||||||
@@ -75,3 +75,4 @@
|
|||||||
* Don't append an extension separator if extension is nonexistent
|
* Don't append an extension separator if extension is nonexistent
|
||||||
* Don't do OCR on word processing or spreadsheet documents; strip tags and store text
|
* Don't do OCR on word processing or spreadsheet documents; strip tags and store text
|
||||||
* Storage backend to storage backend copy support, to move/migrate document to new storage backend
|
* Storage backend to storage backend copy support, to move/migrate document to new storage backend
|
||||||
|
* Tesseract default option OCR setup
|
||||||
|
|||||||
Reference in New Issue
Block a user