Update the OCR app to work on document versions instead of documents; document versions are the module which holds the document page instances. Remove the old OCR document queue and replace it with a single module for OCR processing error entries. Increase compatibility with Django 1.7 and Python 3.

This commit is contained in:
Roberto Rosario
2015-01-15 03:01:43 -04:00
parent 2371d3a49d
commit e6754c9a6f
24 changed files with 375 additions and 328 deletions

View File

@@ -1,61 +1,75 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import logging
from django.dispatch import receiver
from django.utils.translation import ugettext_lazy as _
from south.signals import post_migrate
from acls.api import class_permissions
from common.utils import encapsulate
from documents.models import Document, DocumentVersion
from documents.signals import post_version_upload
from documents.widgets import document_link
from main.api import register_maintenance_links
from navigation.api import register_links
from navigation.api import register_links, register_model_list_columns
from navigation.links import link_spacer
from project_tools.api import register_tool
from rest_api.classes import APIEndPoint
from .links import (all_document_ocr_cleanup, ocr_tool_link,
queue_document_list, queue_document_multiple_delete,
re_queue_multiple_document, submit_document,
submit_document_multiple)
from .models import DocumentQueue
from .links import (
link_document_all_ocr_cleanup, link_document_submit,
link_document_submit_multiple, link_entry_delete,
link_entry_delete_multiple, link_entry_list, link_entry_re_queue,
link_entry_re_queue_multiple
)
from .models import DocumentVersionOCRError
from .permissions import PERMISSION_OCR_DOCUMENT
from .tasks import task_do_ocr
logger = logging.getLogger(__name__)
register_links(Document, [submit_document])
register_links([Document], [submit_document_multiple, link_spacer], menu_name='multi_item_links')
register_links(['ocr:queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete])
register_links(['ocr:queue_document_list'], [queue_document_list], menu_name='secondary_menu')
register_links(Document, [link_document_submit])
register_links([Document], [link_document_submit_multiple, link_spacer], menu_name='multi_item_links')
register_maintenance_links([all_document_ocr_cleanup], namespace='ocr', title=_(u'OCR'))
register_links([DocumentVersionOCRError], [link_entry_re_queue_multiple, link_entry_delete_multiple, link_spacer], menu_name='multi_item_links')
register_links([DocumentVersionOCRError], [link_entry_re_queue, link_entry_delete])
register_links(['ocr:entry_list', 'ocr:entry_delete_multiple', 'ocr:entry_re_queue_multiple', DocumentVersionOCRError], [link_entry_list], menu_name='secondary_menu')
register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR'))
def document_ocr_submit(self):
    """
    Queue the most recent version of this document for OCR.

    ``task_do_ocr`` receives a document version primary key, so the
    document's own primary key must not be sent. A document that has no
    versions yet is silently skipped.
    """
    # NOTE(review): relies on the 'versions' related name and the
    # 'timestamp' field of DocumentVersion -- confirm against
    # documents.models.
    newest_versions = list(self.versions.order_by('-timestamp')[:1])
    if newest_versions:
        task_do_ocr.apply_async(args=[newest_versions[0].pk], queue='ocr')
def document_version_ocr_submit(self):
    """
    Queue this document version for OCR.

    ``task_do_ocr`` looks up a document version by primary key, so this
    version's own primary key is sent, not the parent document's.
    """
    task_do_ocr.apply_async(args=[self.pk], queue='ocr')
@receiver(post_version_upload, dispatch_uid='post_version_upload_ocr', sender=DocumentVersion)
def post_version_upload_ocr(sender, instance, **kwargs):
logger.debug('received post_version_upload')
logger.debug('instance.document: %s', instance.document)
logger.debug('instance pk: %s', instance.pk)
if instance.document.document_type.ocr:
instance.document.submit_for_ocr()
@receiver(post_migrate, dispatch_uid='create_default_queue')
def create_default_queue_signal_handler(sender, **kwargs):
if kwargs['app'] == 'ocr':
DocumentQueue.objects.get_or_create(name='default')
instance.submit_for_ocr()
Document.add_to_class('submit_for_ocr', document_ocr_submit)
DocumentVersion.add_to_class('submit_for_ocr', document_version_ocr_submit)
class_permissions(Document, [PERMISSION_OCR_DOCUMENT])
register_tool(ocr_tool_link)
register_tool(link_entry_list)
APIEndPoint('ocr')
register_model_list_columns(DocumentVersionOCRError, [
{
'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document))
},
{
'name': _('Added'), 'attribute': 'datetime_submitted'
},
{
'name': _('Result'), 'attribute': 'result'
},
])

View File

@@ -1,20 +1,13 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from django.contrib import admin
from .models import DocumentQueue, QueueDocument
from .models import DocumentVersionOCRError
class QueueDocumentInline(admin.StackedInline):
model = QueueDocument
extra = 1
classes = ('collapse-open',)
allow_add = True
class DocumentVersionOCRErrorAdmin(admin.ModelAdmin):
list_display = ('document_version', 'datetime_submitted')
readonly_fields = ('document_version', 'datetime_submitted', 'result')
class DocumentQueueAdmin(admin.ModelAdmin):
inlines = [QueueDocumentInline]
list_display = ('name', 'label')
admin.site.register(DocumentQueue, DocumentQueueAdmin)
admin.site.register(DocumentVersionOCRError, DocumentVersionOCRErrorAdmin)

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import logging
import os
@@ -30,14 +30,14 @@ except sh.CommandNotFound:
UNPAPER = None
def do_document_ocr(document):
def do_document_ocr(document_version):
"""
Try first to extract text from document pages using the registered
parser, if the parser fails or if there is no parser registered for
the document mimetype do a visual OCR by calling the corresponding
OCR backend
"""
for document_page in document.pages.all():
for document_page in document_version.pages.all():
try:
# Try to extract text by means of a parser
parse_document_page(document_page)
@@ -68,10 +68,10 @@ def do_document_ocr(document):
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
try:
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document.language)
ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language)
document_page.content = ocr_cleanup(document.language, ocr_text)
document_page.page_label = _(u'Text from OCR')
document_page.content = ocr_cleanup(document_version.document.language, ocr_text)
document_page.page_label = _('Text from OCR')
document_page.save()
finally:
fs_cleanup(pre_ocr_filepath_w_ext)
@@ -86,7 +86,7 @@ def ocr_cleanup(language, text):
cleanup filter
"""
try:
language_backend = load_backend(u'.'.join([u'ocr', u'lang', language, u'LanguageBackend']))()
language_backend = load_backend('.'.join(['ocr', 'lang', language, 'LanguageBackend']))()
except ImportError:
language_backend = None
@@ -104,9 +104,9 @@ def ocr_cleanup(language, text):
result = word
if result:
output.append(result)
output.append(u'\n')
output.append('\n')
return u' '.join(output)
return ' '.join(output)
def clean_pages():

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import absolute_import, unicode_literals
from django.core.exceptions import PermissionDenied
from django.shortcuts import get_object_or_404
@@ -8,33 +8,33 @@ from rest_framework.response import Response
from rest_framework.settings import api_settings
from acls.models import AccessEntry
from documents.models import Document
from documents.models import DocumentVersion
from permissions.models import Permission
from rest_api.permissions import MayanPermission
from .permissions import PERMISSION_OCR_DOCUMENT
from .serializers import DocumentOCRSerializer
from .serializers import DocumentVersionOCRSerializer
class DocumentOCRView(generics.GenericAPIView):
serializer_class = DocumentOCRSerializer
class DocumentVersionOCRView(generics.GenericAPIView):
serializer_class = DocumentVersionOCRSerializer
permission_classes = (MayanPermission,)
def post(self, request, *args, **kwargs):
"""Submit document OCR queue."""
"""Submit document version for OCR."""
serializer = self.get_serializer(data=request.DATA, files=request.FILES)
if serializer.is_valid():
document = get_object_or_404(Document, pk=serializer.data['document_id'])
document_version = get_object_or_404(DocumentVersion, pk=serializer.data['document_version_id'])
try:
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
except PermissionDenied:
AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document)
AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document_version.document)
document.submit_for_ocr()
document_version.submit_for_ocr()
headers = self.get_success_headers(serializer.data)
return Response(serializer.data, status=status.HTTP_202_ACCEPTED,

View File

@@ -1,3 +1,3 @@
class BackendBase(object):
def execute(self, input_filename, language=None): # NOQA
def execute(self, input_filename, language=None):
raise NotImplementedError

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import codecs
import errno
@@ -20,11 +20,11 @@ class Tesseract(BackendBase):
"""
fd, filepath = tempfile.mkstemp()
os.close(fd)
ocr_output = os.extsep.join([filepath, u'txt'])
ocr_output = os.extsep.join([filepath, 'txt'])
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
if language is not None:
command.extend([u'-l', language])
command.extend(['-l', language])
try:
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)

View File

@@ -1,3 +1,6 @@
from __future__ import unicode_literals
class OCRError(Exception):
"""
Raised by the OCR backend

View File

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import re

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import re

View File

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import re

View File

@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
from __future__ import unicode_literals
import re

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from django.utils.translation import ugettext_lazy as _
@@ -6,14 +6,13 @@ from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES,
PERMISSION_OCR_DOCUMENT,
PERMISSION_OCR_DOCUMENT_DELETE)
submit_document = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
submit_document_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document_multiple', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
re_queue_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
re_queue_multiple_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_multiple_document', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
queue_document_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
queue_document_multiple_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
link_document_submit = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
link_document_submit_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit_multiple', 'famfam': 'hourglass_add'}
link_entry_re_queue = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]}
link_entry_re_queue_multiple = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue_multiple', 'famfam': 'hourglass_add'}
link_entry_delete = {'text': _('Delete'), 'view': 'ocr:entry_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]}
link_entry_delete_multiple = {'text': _('Delete'), 'view': 'ocr:entry_delete_multiple', 'famfam': 'hourglass_delete'}
all_document_ocr_cleanup = {'text': _(u'Clean up pages content'), 'view': 'ocr:all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')}
link_document_all_ocr_cleanup = {'text': _('Clean up pages content'), 'view': 'ocr:document_all_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _('Runs a language filter to remove common OCR mistakes from document pages content.')}
queue_document_list = {'text': _(u'Queue document list'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]}
ocr_tool_link = {'text': _(u'OCR'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]}
link_entry_list = {'text': _('OCR Errors'), 'view': 'ocr:entry_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]}

View File

@@ -1,4 +1,6 @@
DEFAULT_OCR_FILE_FORMAT = u'tiff'
DEFAULT_OCR_FILE_EXTENSION = u'tif'
from __future__ import unicode_literals
DEFAULT_OCR_FILE_FORMAT = 'tiff'
DEFAULT_OCR_FILE_EXTENSION = 'tif'
LOCK_EXPIRE = 60 * 10 # Adjust to worst case scenario
UNPAPER_FILE_FORMAT = u'ppm'
UNPAPER_FILE_FORMAT = 'ppm'

View File

@@ -1,39 +1,22 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from django.db import models
from django.core.exceptions import ObjectDoesNotExist
from django.utils.translation import ugettext
from django.utils.encoding import python_2_unicode_compatible
from django.utils.translation import ugettext_lazy as _
from documents.models import Document
from documents.models import DocumentVersion
class DocumentQueue(models.Model):
name = models.CharField(max_length=64, unique=True, verbose_name=_(u'Name'))
label = models.CharField(max_length=64, verbose_name=_(u'Label'))
@python_2_unicode_compatible
class DocumentVersionOCRError(models.Model):
document_version = models.ForeignKey(DocumentVersion, verbose_name=_('Document version'))
datetime_submitted = models.DateTimeField(verbose_name=_('Date time submitted'), auto_now=True, db_index=True)
result = models.TextField(blank=True, null=True, verbose_name=_('Result'))
class Meta:
verbose_name = _(u'Document queue')
verbose_name_plural = _(u'Document queues')
def __unicode__(self):
return self.label
class QueueDocument(models.Model):
document_queue = models.ForeignKey(DocumentQueue, related_name='documents', verbose_name=_(u'Document queue'))
document = models.ForeignKey(Document, verbose_name=_(u'Document'))
datetime_submitted = models.DateTimeField(verbose_name=_(u'Date time submitted'), auto_now=True, db_index=True)
result = models.TextField(blank=True, null=True, verbose_name=_(u'Result'))
node_name = models.CharField(max_length=256, verbose_name=_(u'Node name'), blank=True, null=True)
def __str__(self):
    """Return the text representation of the related document version."""
    # The ``unicode`` builtin does not exist on Python 3. With
    # ``unicode_literals`` in effect, '%s' formatting yields a text
    # string on both Python 2 and Python 3.
    return '%s' % self.document_version
class Meta:
ordering = ('datetime_submitted',)
verbose_name = _(u'Queue document')
verbose_name_plural = _(u'Queue documents')
def __unicode__(self):
try:
return unicode(self.document)
except ObjectDoesNotExist:
return ugettext(u'Missing document.')
verbose_name = _('Document Version OCR Error')
verbose_name_plural = _('Document Version OCR Errors')

View File

@@ -1,3 +1,5 @@
from __future__ import unicode_literals
import logging
import os
import slate
@@ -90,7 +92,7 @@ class SlateParser(Parser):
raise ParserError
document_page.content = pdf_pages[document_page.page_number - 1]
document_page.page_label = _(u'Text extracted from PDF')
document_page.page_label = _('Text extracted from PDF')
document_page.save()
@@ -112,7 +114,7 @@ class OfficeParser(Parser):
# Now that the office document has been converted to PDF
# call the corresponding PDF parser in this new file
parse_document_page(document_page, descriptor=open(input_filepath), mimetype=u'application/pdf')
parse_document_page(document_page, descriptor=open(input_filepath), mimetype='application/pdf')
else:
raise ParserError
@@ -126,7 +128,7 @@ class PopplerParser(Parser):
PDF parser using the pdftotext executable from the poppler package
"""
def __init__(self):
self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else u'/usr/bin/pdftotext'
self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else '/usr/bin/pdftotext'
if not os.path.exists(self.pdftotext_path):
raise ParserError('cannot find pdftotext executable')
logger.debug('self.pdftotext_path: %s', self.pdftotext_path)
@@ -167,9 +169,9 @@ class PopplerParser(Parser):
raise ParserError('No output')
document_page.content = output
document_page.page_label = _(u'Text extracted from PDF')
document_page.page_label = _('Text extracted from PDF')
document_page.save()
register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser])
register_parser(mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser])
register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser])

View File

@@ -1,10 +1,10 @@
from __future__ import absolute_import
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from permissions.models import Permission, PermissionNamespace
ocr_namespace = PermissionNamespace('ocr', _(u'OCR'))
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _(u'Submit documents for OCR'))
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _(u'Delete documents from OCR queue'))
PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _(u'Can execute the OCR clean up on all document pages'))
ocr_namespace = PermissionNamespace('ocr', _('OCR'))
PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR'))
PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue'))
PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages'))

View File

@@ -1,5 +1,3 @@
from __future__ import absolute_import
from common.utils import load_backend
from .settings import BACKEND

View File

@@ -1,7 +1,5 @@
from __future__ import absolute_import
from rest_framework import serializers
class DocumentOCRSerializer(serializers.Serializer):
document_id = serializers.IntegerField()
class DocumentVersionOCRSerializer(serializers.Serializer):
document_version_id = serializers.IntegerField()

View File

@@ -1,16 +1,16 @@
"""Configuration options for the ocr app"""
from __future__ import unicode_literals
from django.utils.translation import ugettext_lazy as _
from smart_settings.api import register_settings
register_settings(
namespace=u'ocr',
module=u'ocr.settings',
namespace='ocr',
module='ocr.settings',
settings=[
{'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True},
{'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True},
{'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
{'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')},
{'name': 'TESSERACT_PATH', 'global_name': 'OCR_TESSERACT_PATH', 'default': '/usr/bin/tesseract', 'exists': True},
{'name': 'UNPAPER_PATH', 'global_name': 'OCR_UNPAPER_PATH', 'default': '/usr/bin/unpaper', 'description': _('File path to unpaper program.'), 'exists': True},
{'name': 'PDFTOTEXT_PATH', 'global_name': 'OCR_PDFTOTEXT_PATH', 'default': '/usr/bin/pdftotext', 'description': _('File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True},
{'name': 'BACKEND', 'global_name': 'OCR_BACKEND', 'default': 'ocr.backends.tesseract.Tesseract', 'description': _('Full path to the backend to be used to do OCR.')},
]
)

View File

@@ -0,0 +1,88 @@
# -*- coding: utf-8 -*-
from south.utils import datetime_utils as datetime
from south.db import db
from south.v2 import SchemaMigration
from django.db import models
class Migration(SchemaMigration):
def forwards(self, orm):
# Deleting model 'DocumentQueue'
db.delete_table(u'ocr_documentqueue')
# Deleting model 'QueueDocument'
db.delete_table(u'ocr_queuedocument')
# Adding model 'DocumentVersionOCRError'
db.create_table(u'ocr_documentversionocrerror', (
(u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
('document_version', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.DocumentVersion'])),
('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, db_index=True, blank=True)),
('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)),
))
db.send_create_signal(u'ocr', ['DocumentVersionOCRError'])
def backwards(self, orm):
# Adding model 'DocumentQueue'
db.create_table(u'ocr_documentqueue', (
(u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
('name', self.gf('django.db.models.fields.CharField')(max_length=64, unique=True)),
('label', self.gf('django.db.models.fields.CharField')(max_length=64)),
))
db.send_create_signal(u'ocr', ['DocumentQueue'])
# Adding model 'QueueDocument'
db.create_table(u'ocr_queuedocument', (
('node_name', self.gf('django.db.models.fields.CharField')(max_length=256, null=True, blank=True)),
('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)),
('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, blank=True, db_index=True)),
('document_queue', self.gf('django.db.models.fields.related.ForeignKey')(related_name='documents', to=orm['ocr.DocumentQueue'])),
('document', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.Document'])),
(u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)),
))
db.send_create_signal(u'ocr', ['QueueDocument'])
# Deleting model 'DocumentVersionOCRError'
db.delete_table(u'ocr_documentversionocrerror')
models = {
u'documents.document': {
'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'},
'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}),
'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}),
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'label': ('django.db.models.fields.CharField', [], {'default': "u'Uninitialized document'", 'max_length': '255', 'db_index': 'True'}),
'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}),
'uuid': ('django.db.models.fields.CharField', [], {'default': "u'b5b498b5-ffe5-4b70-b8a6-6c875ed11bf2'", 'max_length': '48'})
},
u'documents.documenttype': {
'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'},
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}),
'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'})
},
u'documents.documentversion': {
'Meta': {'object_name': 'DocumentVersion'},
'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}),
'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}),
'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}),
'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}),
'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}),
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}),
'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'})
},
u'ocr.documentversionocrerror': {
'Meta': {'ordering': "('datetime_submitted',)", 'object_name': 'DocumentVersionOCRError'},
'datetime_submitted': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}),
'document_version': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentVersion']"}),
u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}),
'result': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'})
}
}
complete_apps = ['ocr']

View File

@@ -1,65 +1,61 @@
from __future__ import absolute_import
from __future__ import unicode_literals
import logging
import platform
import sys
import traceback
from django.conf import settings
from documents.models import Document
from documents.models import DocumentVersion
from lock_manager import Lock, LockError
from mayan.celery import app
from .api import do_document_ocr
from .literals import LOCK_EXPIRE
from .models import DocumentQueue, QueueDocument
from .models import DocumentVersionOCRError
logger = logging.getLogger(__name__)
@app.task(ignore_result=True)
def task_do_ocr(document_pk):
lock_id = u'task_do_ocr_doc-%d' % document_pk
def task_do_ocr(document_version_pk):
lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk
try:
logger.debug('trying to acquire lock: %s', lock_id)
# Acquire lock to avoid doing OCR on the same document more than once
# concurrently
# Acquire lock to avoid doing OCR on the same document version more than
# once concurrently
lock = Lock.acquire_lock(lock_id, LOCK_EXPIRE)
logger.debug('acquired lock: %s', lock_id)
document = None
document_version = None
try:
logger.info('Starting document OCR for document: %d', document_pk)
document = Document.objects.get(pk=document_pk)
do_document_ocr(document)
logger.info('Starting document OCR for document version: %d', document_version_pk)
document_version = DocumentVersion.objects.get(pk=document_version_pk)
do_document_ocr(document_version)
except Exception as exception:
logger.error('OCR error for document: %d; %s', document_pk, exception)
document_queue = DocumentQueue.objects.get(name='default')
if document:
queue_document, created = document_queue.documents.get_or_create(document=document)
queue_document.node_name = platform.node()
logger.error('OCR error for document version: %d; %s', document_version_pk, exception)
if document_version:
entry, created = DocumentVersionOCRError.objects.get_or_create(document_version=document_version)
if settings.DEBUG:
result = []
type, value, tb = sys.exc_info()
result.append('%s: %s' % (type.__name__, value))
result.extend(traceback.format_tb(tb))
queue_document.result = '\n'.join(result)
entry.result = '\n'.join(result)
else:
queue_document.result = exception
entry.result = exception
queue_document.save()
entry.save()
else:
logger.info('OCR for document: %d ended', document_pk)
document_queue = DocumentQueue.objects.get(name='default')
logger.info('OCR for document: %d ended', document_version_pk)
try:
queue_document = document_queue.documents.get(document=document)
except QueueDocument.DoesNotExist:
entry = DocumentVersionOCRError.objects.get(document_version=document_version)
except DocumentVersionOCRError.DoesNotExist:
pass
else:
queue_document.delete()
entry.delete()
finally:
lock.release()
except LockError:
logger.debug('unable to obtain lock')
logger.debug('unable to obtain lock: %s' % lock_id)
pass

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import unicode_literals
from django.core.files.base import File
from django.test import TransactionTestCase
@@ -6,8 +6,6 @@ from django.test import TransactionTestCase
from documents.models import Document, DocumentType
from documents.tests import TEST_SMALL_DOCUMENT_PATH, TEST_DOCUMENT_TYPE
from .models import DocumentQueue, QueueDocument
class DocumentOCRTestCase(TransactionTestCase):
def setUp(self):
@@ -16,11 +14,6 @@ class DocumentOCRTestCase(TransactionTestCase):
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document = Document.objects.new_document(file_object=File(file_object), document_type=self.document_type)[0].document
DocumentQueue.objects.get_or_create(name='default')
# Clear OCR queue
QueueDocument.objects.all().delete()
def _test_ocr_language_issue_16(self, language, result):
"""
Reusable OCR test for a specific language

View File

@@ -1,19 +1,21 @@
from __future__ import unicode_literals
from django.conf.urls import patterns, url
from .api_views import DocumentOCRView
from .api_views import DocumentVersionOCRView
urlpatterns = patterns('ocr.views',
url(r'^document/(?P<document_id>\d+)/submit/$', 'submit_document', (), 'submit_document'),
url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'),
url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'),
url(r'^queue/document/(?P<queue_document_id>\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'),
url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'),
url(r'^queue/document/(?P<queue_document_id>\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'),
url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'),
url(r'^document/(?P<pk>\d+)/submit/$', 'document_submit', (), 'document_submit'),
url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'),
url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'),
url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'),
url(r'^all/$', 'entry_list', (), 'entry_list'),
url(r'^(?P<pk>\d+)/delete/$', 'entry_delete', (), 'entry_delete'),
url(r'^multiple/delete/$', 'entry_delete_multiple', (), 'entry_delete_multiple'),
url(r'^(?P<pk>\d+)/re-queue/$', 'entry_re_queue', (), 'entry_re_queue'),
url(r'^multiple/re-queue/$', 'entry_re_queue_multiple', (), 'entry_re_queue_multiple'),
)
api_urls = patterns('',
url(r'^submit/$', DocumentOCRView.as_view(), name='document-ocr-submit-view'),
url(r'^submit/$', DocumentVersionOCRView.as_view(), name='document-version-ocr-submit-view'),
)

View File

@@ -1,4 +1,4 @@
from __future__ import absolute_import
from __future__ import absolute_import, unicode_literals
from django.contrib import messages
from django.core.exceptions import PermissionDenied
@@ -6,173 +6,43 @@ from django.core.urlresolvers import reverse
from django.http import HttpResponseRedirect
from django.shortcuts import get_object_or_404, render_to_response
from django.template import RequestContext
from django.utils.translation import ugettext_lazy as _
from django.utils.translation import ugettext_lazy as _, ungettext
from acls.models import AccessEntry
from common.utils import encapsulate
from documents.models import Document
from documents.widgets import document_link, document_thumbnail
from documents.models import Document, DocumentVersion
from permissions.models import Permission
from .api import clean_pages
from .models import DocumentQueue, QueueDocument
from .models import DocumentVersionOCRError
from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES,
PERMISSION_OCR_DOCUMENT,
PERMISSION_OCR_DOCUMENT_DELETE)
def queue_document_list(request, queue_name='default'):
    """
    Render the list of documents held by the OCR queue named ``queue_name``.

    PERMISSION_OCR_DOCUMENT is checked for the requesting user first; the
    queue is then looked up by name with ``get_object_or_404``.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])

    queue = get_object_or_404(DocumentQueue, name=queue_name)

    # Column definitions for the generic list template. Callables are
    # wrapped with encapsulate(), plain strings name attributes of each row.
    columns = [
        {
            'name': _('Document'),
            'attribute': encapsulate(lambda entry: document_link(entry.document) if hasattr(entry, 'document') else _(u'Missing document.')),
        },
        {
            'name': _(u'Thumbnail'),
            'attribute': encapsulate(lambda entry: document_thumbnail(entry.document)),
        },
        {
            'name': _('Added'),
            # Drop the fractional seconds from the submission timestamp.
            'attribute': encapsulate(lambda entry: unicode(entry.datetime_submitted).split('.')[0]),
            'keep_together': True,
        },
        {'name': _('Node'), 'attribute': 'node_name'},
        {'name': _('Result'), 'attribute': 'result'},
    ]

    return render_to_response(
        'main/generic_list.html',
        {
            'object_list': queue.documents.all(),
            'title': _(u'Documents in queue: %s') % queue,
            'hide_object': True,
            'queue': queue,
            'navigation_object_name': 'queue',
            'list_object_variable_name': 'queue_document',
            'extra_columns': columns,
        },
        context_instance=RequestContext(request)
    )
def queue_document_delete(request, queue_document_id=None, queue_document_id_list=None):
    """
    Confirm (GET) and perform (POST) the deletion of queue documents.

    Either ``queue_document_id`` (a single primary key) or
    ``queue_document_id_list`` (comma separated primary keys) selects the
    QueueDocument instances; each one is fetched with ``get_object_or_404``.
    When neither is given an error message is queued and the user is sent
    back to the referring page (or ``main:home``).

    PERMISSION_OCR_DOCUMENT_DELETE is checked before anything else.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE])

    if queue_document_id:
        queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)]
    elif queue_document_id_list:
        queue_documents = [
            get_object_or_404(QueueDocument, pk=item_id)
            for item_id in queue_document_id_list.split(',')
        ]
    else:
        messages.error(request, _(u'Must provide at least one queue document.'))
        return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))

    referer = request.META.get('HTTP_REFERER', None)
    # Named next_url so the builtin next() is not shadowed.
    next_url = request.POST.get('next', request.GET.get('next', referer))
    previous = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method == 'POST':
        for queue_document in queue_documents:
            try:
                queue_document.delete()
                messages.success(request, _(u'Queue document: %(document)s deleted successfully.') % {
                    'document': queue_document.document})
            except Exception as exception:
                # Best effort: report the failure to the user and continue
                # with the remaining selections.
                messages.error(request, _(u'Error deleting document: %(document)s; %(error)s') % {
                    'document': queue_document, 'error': exception})

        # next_url is None when the request carries neither a "next"
        # parameter nor a Referer header; fall back to the home view, the
        # same default the error branch above uses.
        return HttpResponseRedirect(next_url or reverse('main:home'))

    context = {
        'next': next_url,
        'previous': previous,
        'delete_view': True,
    }

    names = ', '.join([unicode(d) for d in queue_documents])
    if len(queue_documents) == 1:
        context['object'] = queue_documents[0]
        context['title'] = _(u'Are you sure you wish to delete queue document: %s?') % names
    elif len(queue_documents) > 1:
        context['title'] = _(u'Are you sure you wish to delete queue documents: %s?') % names

    return render_to_response('main/generic_confirm.html', context,
        context_instance=RequestContext(request))
def queue_document_multiple_delete(request):
    """
    Multiple selection entry point for ``queue_document_delete``.

    Forwards the ``id_list`` query string value (empty string when absent)
    as the comma separated list of queue document ids.
    """
    selected_ids = request.GET.get('id_list', '')
    return queue_document_delete(request, queue_document_id_list=selected_ids)
def submit_document_multiple(request):
    """
    Submit every document named in the ``id_list`` query parameter for OCR.

    ``id_list`` is a comma separated string of document ids; each id is
    handed to ``submit_document``. The user is then redirected to the
    referring page, or to ``main:home`` when there is no referer.
    """
    for item_id in request.GET.get('id_list', '').split(','):
        if not item_id:
            # ''.split(',') yields [''], so a missing or empty id_list (or
            # a stray comma) would otherwise submit a blank id.
            continue
        submit_document(request, item_id)

    return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
def submit_document(request, document_id):
document = get_object_or_404(Document, pk=document_id)
def document_submit(request, pk):
document = get_object_or_404(Document, pk=pk)
try:
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
except PermissionDenied:
AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document)
return submit_document_to_queue(request, document=document,
post_submit_redirect=request.META.get('HTTP_REFERER', reverse('main:home')))
def submit_document_to_queue(request, document, post_submit_redirect=None):
"""
This view is meant to be reusable
"""
document.submit_for_ocr()
messages.success(request, _(u'Document: %(document)s was added to the OCR queue.') % {
messages.success(request, _('Document: %(document)s was added to the OCR queue.') % {
'document': document}
)
if post_submit_redirect:
return HttpResponseRedirect(post_submit_redirect)
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
def re_queue_document(request, queue_document_id=None, queue_document_id_list=None):
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])
def document_submit_multiple(request):
for item_id in request.GET.get('id_list', '').split(','):
document_submit(request, item_id)
if queue_document_id:
queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)]
elif queue_document_id_list:
queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id) for queue_document_id in queue_document_id_list.split(',')]
else:
messages.error(request, _(u'Must provide at least one queue document.'))
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None)))
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
if request.method == 'POST':
for queue_document in queue_documents:
try:
queue_document.document.submit_for_ocr()
messages.success(
request,
_(u'Document: %(document)s was re-queued for OCR.') % {
'document': queue_document.document
}
)
except Document.DoesNotExist:
messages.error(request, _(u'Document id#: %d, no longer exists.') % queue_document.document_id)
return HttpResponseRedirect(next)
context = {
'next': next,
'previous': previous,
}
if len(queue_documents) == 1:
context['object'] = queue_documents[0]
context['title'] = _(u'Are you sure you wish to re-queue document: %s?') % ', '.join([unicode(d) for d in queue_documents])
elif len(queue_documents) > 1:
context['title'] = _(u'Are you sure you wish to re-queue documents: %s?') % ', '.join([unicode(d) for d in queue_documents])
return render_to_response('main/generic_confirm.html', context,
context_instance=RequestContext(request))
return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))
def re_queue_multiple_document(request):
return re_queue_document(request, queue_document_id_list=request.GET.get('id_list', []))
def all_document_ocr_cleanup(request):
def document_all_ocr_cleanup(request):
Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES])
previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None)))
@@ -182,27 +52,133 @@ def all_document_ocr_cleanup(request):
return render_to_response('main/generic_confirm.html', {
'previous': previous,
'next': next,
'title': _(u'Are you sure you wish to clean up all the pages content?'),
'message': _(u'On large databases this operation may take some time to execute.'),
'title': _('Are you sure you wish to clean up all the pages content?'),
'message': _('On large databases this operation may take some time to execute.'),
}, context_instance=RequestContext(request))
else:
try:
# TODO: turn this into a Celery task
clean_pages()
messages.success(request, _(u'Document pages content clean up complete.'))
messages.success(request, _('Document pages content clean up complete.'))
except Exception as exception:
messages.error(request, _(u'Document pages content clean up error: %s') % exception)
messages.error(request, _('Document pages content clean up error: %s') % exception)
return HttpResponseRedirect(next)
def display_link(obj):
    """
    Return an HTML anchor for ``obj`` when it exposes ``get_absolute_url``.

    The anchor text is the string form of ``obj``. Objects without a
    ``get_absolute_url`` attribute are returned unchanged.
    """
    if not hasattr(obj, 'get_absolute_url'):
        return obj

    return u'<a href="%(url)s">%(obj)s</a>' % {
        'url': obj.get_absolute_url(),
        'obj': obj
    }
def entry_list(request):
    """
    Render every DocumentVersionOCRError entry with the generic list
    template, after checking PERMISSION_OCR_DOCUMENT for the requesting user.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])

    return render_to_response(
        'main/generic_list.html',
        dict(
            object_list=DocumentVersionOCRError.objects.all(),
            title=_('OCR errors'),
            hide_object=True,
        ),
        context_instance=RequestContext(request)
    )
def entry_delete(request, pk=None, pk_list=None):
    """
    Confirm (GET) and perform (POST) the deletion of OCR error entries.

    Either ``pk`` (a single primary key) or ``pk_list`` (comma separated
    primary keys) selects the DocumentVersionOCRError instances; each one is
    fetched with ``get_object_or_404``. When neither is given an error
    message is queued and the user is sent back to the referring page (or
    ``main:home``).

    PERMISSION_OCR_DOCUMENT_DELETE is checked before anything else.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE])

    if pk:
        entries = [get_object_or_404(DocumentVersionOCRError, pk=pk)]
    elif pk_list:
        entries = [
            get_object_or_404(DocumentVersionOCRError, pk=entry_pk)
            for entry_pk in pk_list.split(',')
        ]
    else:
        messages.error(request, _('Make at least one selection.'))
        return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))

    referer = request.META.get('HTTP_REFERER', None)
    # Named next_url so the builtin next() is not shadowed.
    next_url = request.POST.get('next', request.GET.get('next', referer))
    previous = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method == 'POST':
        for entry in entries:
            try:
                entry.delete()
                messages.success(request, _('Entry: %(entry)s deleted successfully.') % {
                    'entry': entry})
            except Exception as exception:
                # Best effort: report the failure to the user and continue
                # with the remaining selections.
                messages.error(request, _('Error entry: %(entry)s; %(error)s') % {
                    'entry': entry, 'error': exception})

        # next_url is None when the request carries neither a "next"
        # parameter nor a Referer header; fall back to the home view, the
        # same default the error branch above uses.
        return HttpResponseRedirect(next_url or reverse('main:home'))

    context = {
        'next': next_url,
        'previous': previous,
        'delete_view': True,
    }

    if len(entries) == 1:
        context['object'] = entries[0]

    context['title'] = ungettext(
        'Are you sure you wish to delete the entry: %(entry)s?',
        'Are you sure you wish to delete these %(count)d entries.',
        len(entries)
    ) % {
        'count': len(entries),
        'entry': entries[0],
    }

    return render_to_response('main/generic_confirm.html', context,
        context_instance=RequestContext(request))
def entry_delete_multiple(request):
    """
    Multiple selection entry point for ``entry_delete``.

    Forwards the ``id_list`` query string value (empty string when absent)
    as the comma separated list of entry primary keys.
    """
    selected_ids = request.GET.get('id_list', '')
    return entry_delete(request, pk_list=selected_ids)
def entry_re_queue(request, pk=None, pk_list=None):
    """
    Confirm (GET) and perform (POST) the OCR re-submission of error entries.

    Either ``pk`` (a single primary key) or ``pk_list`` (comma separated
    primary keys) selects the DocumentVersionOCRError instances; each one is
    fetched with ``get_object_or_404``. On POST the document version of each
    entry is submitted for OCR again. When neither argument is given an
    error message is queued and the user is sent back to the referring page
    (or ``main:home``).

    PERMISSION_OCR_DOCUMENT is checked before anything else.
    """
    Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT])

    if pk:
        entries = [get_object_or_404(DocumentVersionOCRError, pk=pk)]
    elif pk_list:
        entries = [
            get_object_or_404(DocumentVersionOCRError, pk=entry_pk)
            for entry_pk in pk_list.split(',')
        ]
    else:
        messages.error(request, _('Make at least one selection.'))
        return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home')))

    referer = request.META.get('HTTP_REFERER', None)
    # Named next_url so the builtin next() is not shadowed.
    next_url = request.POST.get('next', request.GET.get('next', referer))
    previous = request.POST.get('previous', request.GET.get('previous', referer))

    if request.method == 'POST':
        for entry in entries:
            try:
                entry.document_version.submit_for_ocr()
                messages.success(
                    request,
                    _('Entry: %(entry)s was re-queued for OCR.') % {
                        'entry': entry
                    }
                )
            except DocumentVersion.DoesNotExist:
                # The related document version is gone; report it and keep
                # processing the remaining entries.
                messages.error(request, _('Document version id#: %d, no longer exists.') % entry.document_version_id)

        # next_url is None when the request carries neither a "next"
        # parameter nor a Referer header; fall back to the home view, the
        # same default the error branch above uses.
        return HttpResponseRedirect(next_url or reverse('main:home'))

    context = {
        'next': next_url,
        'previous': previous,
    }

    if len(entries) == 1:
        context['object'] = entries[0]

    context['title'] = ungettext(
        'Are you sure you wish to re-queue the entry: %(entry)s?',
        'Are you sure you wish to re-queue these %(count)d entries.',
        len(entries)
    ) % {
        'count': len(entries),
        'entry': entries[0],
    }

    return render_to_response('main/generic_confirm.html', context,
        context_instance=RequestContext(request))
def entry_re_queue_multiple(request):
    """
    Multiple selection entry point for ``entry_re_queue``.

    Forwards the ``id_list`` query string value as the comma separated list
    of entry primary keys.
    """
    # Default to an empty string, not a list: entry_re_queue calls
    # .split(',') on this value, and entry_delete_multiple uses the same
    # default. Both defaults are falsy, so the "no selection" path is
    # unchanged.
    return entry_re_queue(request, pk_list=request.GET.get('id_list', ''))