From 683fa5b531bf2ffcd18082b872103f8c3baf4424 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 14 Jan 2015 17:25:31 -0400 Subject: [PATCH 01/20] Make top level URLs shorter or more explicit --- mayan/urls.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mayan/urls.py b/mayan/urls.py index a12b13e939..12265641df 100644 --- a/mayan/urls.py +++ b/mayan/urls.py @@ -5,36 +5,36 @@ from django.contrib import admin admin.autodiscover() urlpatterns = patterns('', - url(r'^', include('common.urls', namespace='common')), url(r'^', include('main.urls', namespace='main')), + url(r'^accounts/', include('user_management.urls', namespace='user_management')), url(r'^acls/', include('acls.urls', namespace='acls')), url(r'^admin/', include(admin.site.urls)), url(r'^api/', include('rest_api.urls')), url(r'^checkouts/', include('checkouts.urls', namespace='checkouts')), url(r'^comments/', include('document_comments.urls', namespace='comments')), - url(r'^document_acls/', include('document_acls.urls', namespace='document_acls')), - url(r'^document_indexing/', include('document_indexing.urls', namespace='indexing')), + url(r'^common/', include('common.urls', namespace='common')), + url(r'^document/acls/', include('document_acls.urls', namespace='document_acls')), + url(r'^document/signatures/', include('document_signatures.urls', namespace='signatures')), url(r'^documents/', include('documents.urls', namespace='documents')), - url(r'^documents/signatures/', include('document_signatures.urls', namespace='signatures')), url(r'^docs/', include('rest_framework_swagger.urls')), url(r'^events/', include('events.urls', namespace='events')), url(r'^folders/', include('folders.urls', namespace='folders')), url(r'^gpg/', include('django_gpg.urls', namespace='django_gpg')), + url(r'^indexing/', include('document_indexing.urls', namespace='indexing')), url(r'^installation/', include('installation.urls', namespace='installation')), url(r'^linking/', include('linking.urls', namespace='linking')), url(r'^mailer/', include('mailer.urls', namespace='mailer')), url(r'^metadata/', include('metadata.urls', namespace='metadata')), url(r'^ocr/', include('ocr.urls', namespace='ocr')), url(r'^permissions/', include('permissions.urls', namespace='permissions')), - url(r'^project_setup/', include('project_setup.urls', namespace='project_setup')), - url(r'^project_tools/', include('project_tools.urls', namespace='project_tools')), url(r'^registration/', include('registration.urls', namespace='registration')), url(r'^search/', include('dynamic_search.urls', namespace='search')), url(r'^settings/', include('smart_settings.urls', namespace='settings')), + url(r'^setup/', include('project_setup.urls', namespace='project_setup')), url(r'^sources/', include('sources.urls', namespace='sources')), url(r'^statistics/', include('statistics.urls', namespace='statistics')), url(r'^tags/', include('tags.urls', namespace='tags')), - url(r'^user_management/', include('user_management.urls', namespace='user_management')), + url(r'^tools/', include('project_tools.urls', namespace='project_tools')), ) From 4f94bf0dfc91ff7c45a7486fc915fc61862b3ba7 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 14 Jan 2015 17:32:14 -0400 Subject: [PATCH 02/20] Remove vestigial '_orphan_document_' document type if there are no documents of that type --- .../0031_remove_orphan_documents.py | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 mayan/apps/documents/south_migrations/0031_remove_orphan_documents.py diff --git a/mayan/apps/documents/south_migrations/0031_remove_orphan_documents.py b/mayan/apps/documents/south_migrations/0031_remove_orphan_documents.py new file mode 100644 index 0000000000..8b38a3bbf4 --- /dev/null +++ b/mayan/apps/documents/south_migrations/0031_remove_orphan_documents.py @@ -0,0 +1,123 @@ +# -*- coding: utf-8 -*- +from south.utils import datetime_utils as datetime +from south.db import db +from south.v2 import DataMigration +from django.db import models + + +class Migration(DataMigration): + + def forwards(self, orm): + "Write your forwards methods here." + # Note: Don't use "from appname.models import ModelName". + # Use orm.ModelName to refer to models in this application, + # and orm['appname.ModelName'] for models in other applications. + try: + orphan_document_type = orm.DocumentType.objects.get(name='_orphan_document_') + except: + pass + else: + if not orphan_document_type.documents.count(): + orphan_document_type.delete() + + def backwards(self, orm): + "Write your backwards methods here." + + models = { + u'auth.group': { + 'Meta': {'object_name': 'Group'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '80'}), + 'permissions': ('django.db.models.fields.related.ManyToManyField', [], {'to': u"orm['auth.Permission']", 'symmetrical': 'False', 'blank': 'True'}) + }, + u'auth.permission': { + 'Meta': {'ordering': "(u'content_type__app_label', u'content_type__model', u'codename')", 'unique_together': "((u'content_type', u'codename'),)", 'object_name': 'Permission'}, + 'codename': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'content_type': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['contenttypes.ContentType']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '50'}) + }, + u'auth.user': { + 'Meta': {'object_name': 'User'}, + 'date_joined': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'email': ('django.db.models.fields.EmailField', [], {'max_length': '75', 'blank': 'True'}), + 'first_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'groups': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Group']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'is_active': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'is_staff': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'is_superuser': ('django.db.models.fields.BooleanField', [], {'default': 'False'}), + 'last_login': ('django.db.models.fields.DateTimeField', [], {'default': 'datetime.datetime.now'}), + 'last_name': ('django.db.models.fields.CharField', [], {'max_length': '30', 'blank': 'True'}), + 'password': ('django.db.models.fields.CharField', [], {'max_length': '128'}), + 'user_permissions': ('django.db.models.fields.related.ManyToManyField', [], {'symmetrical': 'False', 'related_name': "u'user_set'", 'blank': 'True', 'to': u"orm['auth.Permission']"}), + 'username': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '30'}) + }, + u'contenttypes.contenttype': { + 'Meta': {'ordering': "('name',)", 'unique_together': "(('app_label', 'model'),)", 'object_name': 'ContentType', 'db_table': "'django_content_type'"}, + 'app_label': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'model': ('django.db.models.fields.CharField', [], {'max_length': '100'}), + 'name': ('django.db.models.fields.CharField', [], {'max_length': '100'}) + }, + u'documents.document': { + 'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'}, + 'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'label': ('django.db.models.fields.CharField', [], {'default': "u'Uninitialized document'", 'max_length': '255', 'db_index': 'True'}), + 'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}), + 'uuid': ('django.db.models.fields.CharField', [], {'default': "u'26db4eb3-1050-4d26-8324-74b09d61991f'", 'max_length': '48'}) + }, + u'documents.documentpage': { + 'Meta': {'ordering': "['page_number']", 'object_name': 'DocumentPage'}, + 'content': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_version': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'pages'", 'to': u"orm['documents.DocumentVersion']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'page_label': ('django.db.models.fields.CharField', [], {'max_length': '40', 'null': 'True', 'blank': 'True'}), + 'page_number': ('django.db.models.fields.PositiveIntegerField', [], {'default': '1', 'db_index': 'True'}) + }, + u'documents.documentpagetransformation': { + 'Meta': {'ordering': "('order',)", 'object_name': 'DocumentPageTransformation'}, + 'arguments': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_page': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentPage']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'order': ('django.db.models.fields.PositiveIntegerField', [], {'default': '0', 'null': 'True', 'db_index': 'True', 'blank': 'True'}), + 'transformation': ('django.db.models.fields.CharField', [], {'max_length': '128'}) + }, + u'documents.documenttype': { + 'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}), + 'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'}) + }, + u'documents.documenttypefilename': { + 'Meta': {'ordering': "['filename']", 'unique_together': "(('document_type', 'filename'),)", 'object_name': 'DocumentTypeFilename'}, + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'filenames'", 'to': u"orm['documents.DocumentType']"}), + 'enabled': ('django.db.models.fields.BooleanField', [], {'default': 'True'}), + 'filename': ('django.db.models.fields.CharField', [], {'max_length': '128', 'db_index': 'True'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}) + }, + u'documents.documentversion': { + 'Meta': {'object_name': 'DocumentVersion'}, + 'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}), + 'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}), + 'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}) + }, + u'documents.recentdocument': { + 'Meta': {'ordering': "('-datetime_accessed',)", 'object_name': 'RecentDocument'}, + 'datetime_accessed': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.Document']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'user': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['auth.User']"}) + } + } + + complete_apps = ['documents'] + symmetrical = True From 6e333b7eeb60853a0a3ab79a92cd8616c7bcbbd7 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 14 Jan 2015 17:49:59 -0400 Subject: [PATCH 03/20] The documents app should not have any knowledge of the sources app, move sources link registration to the sources app --- mayan/apps/documents/__init__.py | 6 ++---- mayan/apps/sources/__init__.py | 23 ++++++++++------------- 2 files changed, 12 insertions(+), 17 deletions(-) diff --git a/mayan/apps/documents/__init__.py b/mayan/apps/documents/__init__.py index 1314fd3ce3..b5fd951844 100644 --- a/mayan/apps/documents/__init__.py +++ b/mayan/apps/documents/__init__.py @@ -77,10 +77,8 @@ register_links(Document, [document_events_view, document_version_list], menu_nam # Document Version links register_links(DocumentVersion, [document_version_revert, document_version_download]) -secondary_menu_links = [document_list_recent, document_list] -# TODO: register this at sources app too -register_links(['documents:document_list_recent', 'documents:document_list', 'sources:document_create', 'sources:document_create_multiple', 'sources:upload_interactive', 'sources:staging_file_delete'], secondary_menu_links, menu_name='secondary_menu') -register_links(Document, secondary_menu_links, menu_name='secondary_menu') +register_links(['documents:document_list_recent', 'documents:document_list'], [document_list_recent, document_list], menu_name='secondary_menu') +register_links(Document, [document_list_recent, document_list], menu_name='secondary_menu') # Document page links register_links(DocumentPage, [ diff --git a/mayan/apps/sources/__init__.py b/mayan/apps/sources/__init__.py index 054ea0eebd..e02a186025 100644 --- a/mayan/apps/sources/__init__.py +++ b/mayan/apps/sources/__init__.py @@ -3,6 +3,7 @@ from __future__ import absolute_import from django.utils.translation import ugettext_lazy as _ from common.utils import encapsulate +from documents.links import document_list_recent, document_list from documents.models import Document from navigation.api import register_links, register_model_list_columns from project_setup.api import register_setup @@ -24,17 +25,6 @@ from .links import (document_create_multiple, document_create_siblings, from .models import Source, SourceTransformation from .widgets import staging_file_thumbnail -register_links([StagingFile], [staging_file_delete]) - -register_links([Source, 'sources:setup_source_list', 'sources:setup_source_create'], [setup_sources, setup_source_create_webform, setup_source_create_staging_folder, setup_source_create_pop3_email, setup_source_create_imap_email, setup_source_create_watch_folder], menu_name='secondary_menu') -register_links([Source], [setup_source_edit, setup_source_transformation_list, setup_source_delete]) - -register_links(SourceTransformation, [setup_source_transformation_edit, setup_source_transformation_delete]) -register_links(['sources:setup_source_transformation_create', 'sources:setup_source_transformation_edit', 'sources:setup_source_transformation_delete', 'sources:setup_source_transformation_list'], [setup_source_transformation_create], menu_name='sidebar') - -# Document version -register_links(['documents:document_version_list', 'documents:upload_version', 'documents:document_version_revert'], [upload_version], menu_name='sidebar') - register_model_list_columns(StagingFile, [ { 'name': _(u'Thumbnail'), 'attribute': @@ -42,9 +32,16 @@ register_model_list_columns(StagingFile, [ }, ]) -register_setup(setup_sources) - +register_links([StagingFile], [staging_file_delete]) +register_links([Source, 'sources:setup_source_list', 'sources:setup_source_create'], [setup_sources, setup_source_create_webform, setup_source_create_staging_folder, setup_source_create_pop3_email, setup_source_create_imap_email, setup_source_create_watch_folder], menu_name='secondary_menu') +register_links([Source], [setup_source_edit, setup_source_transformation_list, setup_source_delete]) +register_links(SourceTransformation, [setup_source_transformation_edit, setup_source_transformation_delete]) +register_links(['sources:setup_source_transformation_create', 'sources:setup_source_transformation_edit', 'sources:setup_source_transformation_delete', 'sources:setup_source_transformation_list'], [setup_source_transformation_create], menu_name='sidebar') +register_links(['documents:document_version_list', 'documents:upload_version', 'documents:document_version_revert'], [upload_version], menu_name='sidebar') register_links([Document, 'documents:document_list_recent', 'documents:document_list', 'sources:document_create', 'sources:document_create_multiple', 'sources:upload_interactive', 'sources:staging_file_delete'], [document_create_multiple], menu_name='secondary_menu') register_links(Document, [document_create_siblings]) +register_links(['sources:document_create', 'sources:document_create_multiple', 'sources:upload_interactive', 'sources:staging_file_delete'], [document_list_recent, document_list], menu_name='secondary_menu') + +register_setup(setup_sources) APIEndPoint('sources') From fafd84b8d254c69cad0579a09ce876a4471d1102 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 14 Jan 2015 18:47:31 -0400 Subject: [PATCH 04/20] Move magic number variable to the literls.py module --- mayan/apps/ocr/literals.py | 1 + mayan/apps/ocr/tasks.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/mayan/apps/ocr/literals.py b/mayan/apps/ocr/literals.py index 7c0e6d3531..8e80534272 100644 --- a/mayan/apps/ocr/literals.py +++ b/mayan/apps/ocr/literals.py @@ -1,3 +1,4 @@ DEFAULT_OCR_FILE_FORMAT = u'tiff' DEFAULT_OCR_FILE_EXTENSION = u'tif' +LOCK_EXPIRE = 60 * 10 # Adjust to worst case scenario UNPAPER_FILE_FORMAT = u'ppm' diff --git a/mayan/apps/ocr/tasks.py b/mayan/apps/ocr/tasks.py index 10552dcdef..0f506e103f 100644 --- a/mayan/apps/ocr/tasks.py +++ b/mayan/apps/ocr/tasks.py @@ -12,10 +12,10 @@ from lock_manager import Lock, LockError from mayan.celery import app from .api import do_document_ocr +from .literals import LOCK_EXPIRE from .models import DocumentQueue, QueueDocument logger = logging.getLogger(__name__) -LOCK_EXPIRE = 60 * 10 # Adjust to worst case scenario @app.task(ignore_result=True) From b22bb55cbc475b0dbd2e5087f071a631df0029d3 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 14 Jan 2015 18:48:00 -0400 Subject: [PATCH 05/20] User a direct objects.create no need for a separate .save() call --- mayan/apps/documents/models.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mayan/apps/documents/models.py b/mayan/apps/documents/models.py index 75699f09d9..52f8eda778 100644 --- a/mayan/apps/documents/models.py +++ b/mayan/apps/documents/models.py @@ -193,12 +193,11 @@ class Document(models.Model): if not self.is_new_versions_allowed(user=user): raise NewDocumentVersionNotAllowed - new_version = DocumentVersion( + new_version = DocumentVersion.objects.create( document=self, file=file_object, comment=comment or '', ) - new_version.save() logger.debug('new_version saved') From 99316b3deaa4da8d2906f11d0f2dd82854d0b1ed Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 14 Jan 2015 18:56:32 -0400 Subject: [PATCH 06/20] Show document of version being sent for OCR in logger output --- mayan/apps/ocr/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mayan/apps/ocr/__init__.py b/mayan/apps/ocr/__init__.py index 7f16bfad7d..c4be0a4539 100644 --- a/mayan/apps/ocr/__init__.py +++ b/mayan/apps/ocr/__init__.py @@ -41,7 +41,7 @@ def document_ocr_submit(self): @receiver(post_version_upload, dispatch_uid='post_version_upload_ocr', sender=DocumentVersion) def post_version_upload_ocr(sender, instance, **kwargs): logger.debug('received post_version_upload') - logger.debug('instance: %s', instance) + logger.debug('instance.document: %s', instance.document) if instance.document.document_type.ocr: instance.document.submit_for_ocr() From 769d53698583fbdf8a78c5fd5d28652831e1c270 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 02:54:19 -0400 Subject: [PATCH 07/20] Add source -> documents migration dependency check --- mayan/apps/sources/south_migrations/0007_set_doc_type.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mayan/apps/sources/south_migrations/0007_set_doc_type.py b/mayan/apps/sources/south_migrations/0007_set_doc_type.py index dfc681b1cb..1ddb7c9b3e 100644 --- a/mayan/apps/sources/south_migrations/0007_set_doc_type.py +++ b/mayan/apps/sources/south_migrations/0007_set_doc_type.py @@ -13,6 +13,9 @@ def fake_get_or_create(model, *args, **kwargs): class Migration(DataMigration): + needed_by = ( + ('documents', '0024_auto__add_field_documenttype_ocr'), + ) def forwards(self, orm): "Write your forwards methods here." From 747dda21c8fdfc10f817ab1ba482a2297f4ee629 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 02:54:47 -0400 Subject: [PATCH 08/20] Add __unicode__ method for DocumentVersion model --- mayan/apps/documents/models.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mayan/apps/documents/models.py b/mayan/apps/documents/models.py index 52f8eda778..d698df3f0e 100644 --- a/mayan/apps/documents/models.py +++ b/mayan/apps/documents/models.py @@ -300,6 +300,9 @@ class DocumentVersion(models.Model): verbose_name = _(u'Document version') verbose_name_plural = _(u'Document version') + def __unicode__(self): + return u'{0} - {1}'.format(self.document, self.timestamp) + def save(self, *args, **kwargs): """ Overloaded save method that updates the document version's checksum, From 2371d3a49d8f99dd991404ca3928ed570d773dbe Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 02:55:17 -0400 Subject: [PATCH 09/20] Use lazy translation for events model --- mayan/apps/events/models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mayan/apps/events/models.py b/mayan/apps/events/models.py index a7de6e58f3..75ee75c1a9 100644 --- a/mayan/apps/events/models.py +++ b/mayan/apps/events/models.py @@ -2,7 +2,7 @@ from __future__ import unicode_literals from django.db import models from django.utils.encoding import python_2_unicode_compatible -from django.utils.translation import ugettext as _ +from django.utils.translation import ugettext_lazy as _ from actstream.models import Action From e6754c9a6f036441c26bee077ce36c8c72353ec8 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 03:01:43 -0400 Subject: [PATCH 10/20] Update the OCR app to work based on document versions not documents, document version are the module which hold the document pages instances. Remove old OCR document queue and replace with a single module for OCR processing error entries. Increase compatibility with Django 1.7 and Python 3. --- mayan/apps/ocr/__init__.py | 60 ++-- mayan/apps/ocr/admin.py | 19 +- mayan/apps/ocr/api.py | 18 +- mayan/apps/ocr/api_views.py | 18 +- mayan/apps/ocr/backends/__init__.py | 2 +- mayan/apps/ocr/backends/tesseract.py | 6 +- mayan/apps/ocr/exceptions.py | 3 + mayan/apps/ocr/lang/deu.py | 2 +- mayan/apps/ocr/lang/eng.py | 2 +- mayan/apps/ocr/lang/rus.py | 2 +- mayan/apps/ocr/lang/spa.py | 2 +- mayan/apps/ocr/links.py | 19 +- mayan/apps/ocr/literals.py | 8 +- mayan/apps/ocr/models.py | 41 +-- mayan/apps/ocr/parsers/__init__.py | 12 +- mayan/apps/ocr/permissions.py | 10 +- mayan/apps/ocr/runtime.py | 2 - mayan/apps/ocr/serializers.py | 6 +- mayan/apps/ocr/settings.py | 14 +- ...euedocument__add_documentversionocrerro.py | 88 ++++++ mayan/apps/ocr/tasks.py | 48 ++- mayan/apps/ocr/tests.py | 9 +- mayan/apps/ocr/urls.py | 22 +- mayan/apps/ocr/views.py | 290 ++++++++---------- 24 files changed, 375 insertions(+), 328 deletions(-) create mode 100644 mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py diff --git a/mayan/apps/ocr/__init__.py b/mayan/apps/ocr/__init__.py index c4be0a4539..4f4e79d184 100644 --- a/mayan/apps/ocr/__init__.py +++ b/mayan/apps/ocr/__init__.py @@ -1,61 +1,75 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import logging from django.dispatch import receiver from django.utils.translation import ugettext_lazy as _ -from south.signals import post_migrate - from acls.api import class_permissions +from common.utils import encapsulate from documents.models import Document, DocumentVersion from documents.signals import post_version_upload +from documents.widgets import document_link from main.api import register_maintenance_links -from navigation.api import register_links +from navigation.api import register_links, register_model_list_columns from navigation.links import link_spacer from project_tools.api import register_tool from rest_api.classes import APIEndPoint -from .links import (all_document_ocr_cleanup, ocr_tool_link, - queue_document_list, queue_document_multiple_delete, - re_queue_multiple_document, submit_document, - submit_document_multiple) -from .models import DocumentQueue +from .links import ( + link_document_all_ocr_cleanup, link_document_submit, + link_document_submit_multiple, link_entry_delete, + link_entry_delete_multiple, link_entry_list, link_entry_re_queue, + link_entry_re_queue_multiple +) +from .models import DocumentVersionOCRError from .permissions import PERMISSION_OCR_DOCUMENT from .tasks import task_do_ocr logger = logging.getLogger(__name__) -register_links(Document, [submit_document]) -register_links([Document], [submit_document_multiple, link_spacer], menu_name='multi_item_links') -register_links(['ocr:queue_document_list'], [re_queue_multiple_document, queue_document_multiple_delete]) -register_links(['ocr:queue_document_list'], [queue_document_list], menu_name='secondary_menu') +register_links(Document, [link_document_submit]) +register_links([Document], [link_document_submit_multiple, link_spacer], menu_name='multi_item_links') -register_maintenance_links([all_document_ocr_cleanup], namespace='ocr', title=_(u'OCR')) +register_links([DocumentVersionOCRError], [link_entry_re_queue_multiple, link_entry_delete_multiple, link_spacer], menu_name='multi_item_links') +register_links([DocumentVersionOCRError], [link_entry_re_queue, link_entry_delete]) +register_links(['ocr:entry_list', 'ocr:entry_delete_multiple', 'ocr:entry_re_queue_multiple', DocumentVersionOCRError], [link_entry_list], menu_name='secondary_menu') +register_maintenance_links([link_document_all_ocr_cleanup], namespace='ocr', title=_('OCR')) def document_ocr_submit(self): task_do_ocr.apply_async(args=[self.pk], queue='ocr') +def document_version_ocr_submit(self): + task_do_ocr.apply_async(args=[self.document.pk], queue='ocr') + + @receiver(post_version_upload, dispatch_uid='post_version_upload_ocr', sender=DocumentVersion) def post_version_upload_ocr(sender, instance, **kwargs): logger.debug('received post_version_upload') - logger.debug('instance.document: %s', instance.document) + logger.debug('instance pk: %s', instance.pk) if instance.document.document_type.ocr: - instance.document.submit_for_ocr() - - -@receiver(post_migrate, dispatch_uid='create_default_queue') -def create_default_queue_signal_handler(sender, **kwargs): - if kwargs['app'] == 'ocr': - DocumentQueue.objects.get_or_create(name='default') + instance.submit_for_ocr() Document.add_to_class('submit_for_ocr', document_ocr_submit) +DocumentVersion.add_to_class('submit_for_ocr', document_version_ocr_submit) class_permissions(Document, [PERMISSION_OCR_DOCUMENT]) -register_tool(ocr_tool_link) +register_tool(link_entry_list) APIEndPoint('ocr') + +register_model_list_columns(DocumentVersionOCRError, [ + { + 'name': _('Document'), 'attribute': encapsulate(lambda entry: document_link(entry.document_version.document)) + }, + { + 'name': _('Added'), 'attribute': 'datetime_submitted' + }, + { + 'name': _('Result'), 'attribute': 'result' + }, +]) diff --git a/mayan/apps/ocr/admin.py b/mayan/apps/ocr/admin.py index 7cae1df462..cd434cc85f 100644 --- a/mayan/apps/ocr/admin.py +++ b/mayan/apps/ocr/admin.py @@ -1,20 +1,13 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.contrib import admin -from .models import DocumentQueue, QueueDocument +from .models import DocumentVersionOCRError -class QueueDocumentInline(admin.StackedInline): - model = QueueDocument - extra = 1 - classes = ('collapse-open',) - allow_add = True +class DocumentVersionOCRErrorAdmin(admin.ModelAdmin): + list_display = ('document_version', 'datetime_submitted') + readonly_fields = ('document_version', 'datetime_submitted', 'result') -class DocumentQueueAdmin(admin.ModelAdmin): - inlines = [QueueDocumentInline] - list_display = ('name', 'label') - - -admin.site.register(DocumentQueue, DocumentQueueAdmin) +admin.site.register(DocumentVersionOCRError, DocumentVersionOCRErrorAdmin) diff --git a/mayan/apps/ocr/api.py b/mayan/apps/ocr/api.py index f0189a09df..b5e772019a 100644 --- a/mayan/apps/ocr/api.py +++ b/mayan/apps/ocr/api.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import logging import os @@ -30,14 +30,14 @@ except sh.CommandNotFound: UNPAPER = None -def do_document_ocr(document): +def do_document_ocr(document_version): """ Try first to extract text from document pages using the registered parser, if the parser fails or if there is no parser registered for the document mimetype do a visual OCR by calling the corresponding OCR backend """ - for document_page in document.pages.all(): + for document_page in document_version.pages.all(): try: # Try to extract text by means of a parser parse_document_page(document_page) @@ -68,10 +68,10 @@ def do_document_ocr(document): os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: - ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document.language) + ocr_text = ocr_backend.execute(pre_ocr_filepath_w_ext, document_version.document.language) - document_page.content = ocr_cleanup(document.language, ocr_text) - document_page.page_label = _(u'Text from OCR') + document_page.content = ocr_cleanup(document_version.document.language, ocr_text) + document_page.page_label = _('Text from OCR') document_page.save() finally: fs_cleanup(pre_ocr_filepath_w_ext) @@ -86,7 +86,7 @@ def ocr_cleanup(language, text): cleanup filter """ try: - language_backend = load_backend(u'.'.join([u'ocr', u'lang', language, u'LanguageBackend']))() + language_backend = load_backend('.'.join(['ocr', 'lang', language, 'LanguageBackend']))() except ImportError: language_backend = None @@ -104,9 +104,9 @@ def ocr_cleanup(language, text): result = word if result: output.append(result) - output.append(u'\n') + output.append('\n') - return u' '.join(output) + return ' '.join(output) def clean_pages(): diff --git a/mayan/apps/ocr/api_views.py b/mayan/apps/ocr/api_views.py index 220339e54f..75d5bb338a 100644 --- a/mayan/apps/ocr/api_views.py +++ b/mayan/apps/ocr/api_views.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals from django.core.exceptions import PermissionDenied from django.shortcuts import get_object_or_404 @@ -8,33 +8,33 @@ from rest_framework.response import Response from rest_framework.settings import api_settings from acls.models import AccessEntry -from documents.models import Document +from documents.models import DocumentVersion from permissions.models import Permission from rest_api.permissions import MayanPermission from .permissions import PERMISSION_OCR_DOCUMENT -from .serializers import DocumentOCRSerializer +from .serializers import DocumentVersionOCRSerializer -class DocumentOCRView(generics.GenericAPIView): - serializer_class = DocumentOCRSerializer +class DocumentVersionOCRView(generics.GenericAPIView): + serializer_class = DocumentVersionOCRSerializer permission_classes = (MayanPermission,) def post(self, request, *args, **kwargs): - """Submit document OCR queue.""" + """Submit document version for OCR.""" serializer = self.get_serializer(data=request.DATA, files=request.FILES) if serializer.is_valid(): - document = get_object_or_404(Document, pk=serializer.data['document_id']) + document_version = get_object_or_404(DocumentVersion, pk=serializer.data['document_version_id']) try: Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) except PermissionDenied: - AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document) + AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document_version.document) - document.submit_for_ocr() + document_version.submit_for_ocr() headers = self.get_success_headers(serializer.data) return Response(serializer.data, status=status.HTTP_202_ACCEPTED, diff --git a/mayan/apps/ocr/backends/__init__.py b/mayan/apps/ocr/backends/__init__.py index f6e245ceb8..6558a75c85 100644 --- a/mayan/apps/ocr/backends/__init__.py +++ b/mayan/apps/ocr/backends/__init__.py @@ -1,3 +1,3 @@ class BackendBase(object): - def execute(self, input_filename, language=None): # NOQA + def execute(self, input_filename, language=None): raise NotImplementedError diff --git a/mayan/apps/ocr/backends/tesseract.py b/mayan/apps/ocr/backends/tesseract.py index f0c34477c0..e36b4c043a 100644 --- a/mayan/apps/ocr/backends/tesseract.py +++ b/mayan/apps/ocr/backends/tesseract.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import codecs import errno @@ -20,11 +20,11 @@ class Tesseract(BackendBase): """ fd, filepath = tempfile.mkstemp() os.close(fd) - ocr_output = os.extsep.join([filepath, u'txt']) + ocr_output = os.extsep.join([filepath, 'txt']) command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)] if language is not None: - command.extend([u'-l', language]) + command.extend(['-l', language]) try: proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) diff --git a/mayan/apps/ocr/exceptions.py b/mayan/apps/ocr/exceptions.py index 5497c92ea5..123f52160f 100644 --- a/mayan/apps/ocr/exceptions.py +++ b/mayan/apps/ocr/exceptions.py @@ -1,3 +1,6 @@ +from __future__ import unicode_literals + + class OCRError(Exception): """ Raised by the OCR backend diff --git a/mayan/apps/ocr/lang/deu.py b/mayan/apps/ocr/lang/deu.py index a3ca0383e9..ccff3eba7d 100644 --- a/mayan/apps/ocr/lang/deu.py +++ b/mayan/apps/ocr/lang/deu.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/lang/eng.py b/mayan/apps/ocr/lang/eng.py index 29dc3384e8..5025db136d 100644 --- a/mayan/apps/ocr/lang/eng.py +++ b/mayan/apps/ocr/lang/eng.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/lang/rus.py b/mayan/apps/ocr/lang/rus.py index 05ce0e1ab1..e7b7588358 100644 --- a/mayan/apps/ocr/lang/rus.py +++ b/mayan/apps/ocr/lang/rus.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/lang/spa.py b/mayan/apps/ocr/lang/spa.py index eb4d9ead45..c736a69b9a 100644 --- a/mayan/apps/ocr/lang/spa.py +++ b/mayan/apps/ocr/lang/spa.py @@ -1,5 +1,5 @@ # -*- coding: utf-8 -*- -from __future__ import absolute_import +from __future__ import unicode_literals import re diff --git a/mayan/apps/ocr/links.py b/mayan/apps/ocr/links.py index eb41e2b9d7..f41743f121 100644 --- a/mayan/apps/ocr/links.py +++ b/mayan/apps/ocr/links.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.utils.translation import ugettext_lazy as _ @@ -6,14 +6,13 @@ from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE) -submit_document = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -submit_document_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:submit_document_multiple', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -re_queue_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_document', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -re_queue_multiple_document = {'text': _('Re-queue'), 'view': 'ocr:re_queue_multiple_document', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} -queue_document_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} -queue_document_multiple_delete = {'text': _(u'Delete'), 'view': 'ocr:queue_document_multiple_delete', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} +link_document_submit = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} +link_document_submit_multiple = {'text': _('Submit to OCR queue'), 'view': 'ocr:document_submit_multiple', 'famfam': 'hourglass_add'} +link_entry_re_queue = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue', 'args': 'object.id', 'famfam': 'hourglass_add', 'permissions': [PERMISSION_OCR_DOCUMENT]} +link_entry_re_queue_multiple = {'text': _('Re-queue'), 'view': 'ocr:entry_re_queue_multiple', 'famfam': 'hourglass_add'} +link_entry_delete = {'text': _('Delete'), 'view': 'ocr:entry_delete', 'args': 'object.id', 'famfam': 'hourglass_delete', 'permissions': [PERMISSION_OCR_DOCUMENT_DELETE]} +link_entry_delete_multiple = {'text': _('Delete'), 'view': 'ocr:entry_delete_multiple', 'famfam': 'hourglass_delete'} -all_document_ocr_cleanup = {'text': _(u'Clean up pages content'), 'view': 'ocr:all_document_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _(u'Runs a language filter to remove common OCR mistakes from document pages content.')} +link_document_all_ocr_cleanup = {'text': _('Clean up pages content'), 'view': 'ocr:document_all_ocr_cleanup', 'famfam': 'text_strikethrough', 'permissions': [PERMISSION_OCR_CLEAN_ALL_PAGES], 'description': _('Runs a language filter to remove common OCR mistakes from document pages content.')} -queue_document_list = {'text': _(u'Queue document list'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'permissions': [PERMISSION_OCR_DOCUMENT]} -ocr_tool_link = {'text': _(u'OCR'), 'view': 'ocr:queue_document_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]} +link_entry_list = {'text': _('OCR Errors'), 'view': 'ocr:entry_list', 'famfam': 'hourglass', 'icon': 'text.png', 'permissions': [PERMISSION_OCR_DOCUMENT]} diff --git a/mayan/apps/ocr/literals.py b/mayan/apps/ocr/literals.py index 8e80534272..3a7b1360dc 100644 --- a/mayan/apps/ocr/literals.py +++ b/mayan/apps/ocr/literals.py @@ -1,4 +1,6 @@ -DEFAULT_OCR_FILE_FORMAT = u'tiff' -DEFAULT_OCR_FILE_EXTENSION = u'tif' +from __future__ import unicode_literals + +DEFAULT_OCR_FILE_FORMAT = 'tiff' +DEFAULT_OCR_FILE_EXTENSION = 'tif' LOCK_EXPIRE = 60 * 10 # Adjust to worst case scenario -UNPAPER_FILE_FORMAT = u'ppm' +UNPAPER_FILE_FORMAT = 'ppm' diff --git a/mayan/apps/ocr/models.py b/mayan/apps/ocr/models.py index 8533dcea2a..e4c1713eb9 100644 --- a/mayan/apps/ocr/models.py +++ b/mayan/apps/ocr/models.py @@ -1,39 +1,22 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.db import models -from django.core.exceptions import ObjectDoesNotExist -from django.utils.translation import ugettext +from django.utils.encoding import python_2_unicode_compatible from django.utils.translation import ugettext_lazy as _ -from documents.models import Document +from documents.models import DocumentVersion -class DocumentQueue(models.Model): - name = models.CharField(max_length=64, unique=True, verbose_name=_(u'Name')) - label = models.CharField(max_length=64, verbose_name=_(u'Label')) +@python_2_unicode_compatible +class DocumentVersionOCRError(models.Model): + document_version = models.ForeignKey(DocumentVersion, verbose_name=_('Document version')) + datetime_submitted = models.DateTimeField(verbose_name=_('Date time submitted'), auto_now=True, db_index=True) + result = models.TextField(blank=True, null=True, verbose_name=_('Result')) - class Meta: - verbose_name = _(u'Document queue') - verbose_name_plural = _(u'Document queues') - - def __unicode__(self): - return self.label - - -class QueueDocument(models.Model): - document_queue = models.ForeignKey(DocumentQueue, related_name='documents', verbose_name=_(u'Document queue')) - document = models.ForeignKey(Document, verbose_name=_(u'Document')) - datetime_submitted = models.DateTimeField(verbose_name=_(u'Date time submitted'), auto_now=True, db_index=True) - result = models.TextField(blank=True, null=True, verbose_name=_(u'Result')) - node_name = models.CharField(max_length=256, verbose_name=_(u'Node name'), blank=True, null=True) + def __str__(self): + return unicode(self.document_version) class Meta: ordering = ('datetime_submitted',) - verbose_name = _(u'Queue document') - verbose_name_plural = _(u'Queue documents') - - def __unicode__(self): - try: - return unicode(self.document) - except ObjectDoesNotExist: - return ugettext(u'Missing document.') + verbose_name = _('Document Version OCR Error') + verbose_name_plural = _('Document Version OCR Errors') diff --git a/mayan/apps/ocr/parsers/__init__.py b/mayan/apps/ocr/parsers/__init__.py index d599103b15..f505b3c531 100644 --- a/mayan/apps/ocr/parsers/__init__.py +++ b/mayan/apps/ocr/parsers/__init__.py @@ -1,3 +1,5 @@ +from __future__ import unicode_literals + import logging import os import slate @@ -90,7 +92,7 @@ class SlateParser(Parser): raise ParserError document_page.content = pdf_pages[document_page.page_number - 1] - document_page.page_label = _(u'Text extracted from PDF') + document_page.page_label = _('Text extracted from PDF') document_page.save() @@ -112,7 +114,7 @@ class OfficeParser(Parser): # Now that the office document has been converted to PDF # call the coresponding PDF parser in this new file - parse_document_page(document_page, descriptor=open(input_filepath), mimetype=u'application/pdf') + parse_document_page(document_page, descriptor=open(input_filepath), mimetype='application/pdf') else: raise ParserError @@ -126,7 +128,7 @@ class PopplerParser(Parser): PDF parser using the pdftotext execute from the poppler package """ def __init__(self): - self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else u'/usr/bin/pdftotext' + self.pdftotext_path = PDFTOTEXT_PATH if PDFTOTEXT_PATH else '/usr/bin/pdftotext' if not os.path.exists(self.pdftotext_path): raise ParserError('cannot find pdftotext executable') logger.debug('self.pdftotext_path: %s', self.pdftotext_path) @@ -167,9 +169,9 @@ class PopplerParser(Parser): raise ParserError('No output') document_page.content = output - document_page.page_label = _(u'Text extracted from PDF') + document_page.page_label = _('Text extracted from PDF') document_page.save() -register_parser(mimetypes=[u'application/pdf'], parsers=[PopplerParser, SlateParser]) +register_parser(mimetypes=['application/pdf'], parsers=[PopplerParser, SlateParser]) register_parser(mimetypes=office_converter.CONVERTER_OFFICE_FILE_MIMETYPES, parsers=[OfficeParser]) diff --git a/mayan/apps/ocr/permissions.py b/mayan/apps/ocr/permissions.py index e8dbc188e7..b6bf977a6c 100644 --- a/mayan/apps/ocr/permissions.py +++ b/mayan/apps/ocr/permissions.py @@ -1,10 +1,10 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals from django.utils.translation import ugettext_lazy as _ from permissions.models import Permission, PermissionNamespace -ocr_namespace = PermissionNamespace('ocr', _(u'OCR')) -PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _(u'Submit documents for OCR')) -PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _(u'Delete documents from OCR queue')) -PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _(u'Can execute the OCR clean up on all document pages')) +ocr_namespace = PermissionNamespace('ocr', _('OCR')) +PERMISSION_OCR_DOCUMENT = Permission.objects.register(ocr_namespace, 'ocr_document', _('Submit documents for OCR')) +PERMISSION_OCR_DOCUMENT_DELETE = Permission.objects.register(ocr_namespace, 'ocr_document_delete', _('Delete documents from OCR queue')) +PERMISSION_OCR_CLEAN_ALL_PAGES = Permission.objects.register(ocr_namespace, 'ocr_clean_all_pages', _('Can execute the OCR clean up on all document pages')) diff --git a/mayan/apps/ocr/runtime.py b/mayan/apps/ocr/runtime.py index eef63478c0..78aef88077 100644 --- a/mayan/apps/ocr/runtime.py +++ b/mayan/apps/ocr/runtime.py @@ -1,5 +1,3 @@ -from __future__ import absolute_import - from common.utils import load_backend from .settings import BACKEND diff --git a/mayan/apps/ocr/serializers.py b/mayan/apps/ocr/serializers.py index c38fb42f7c..9640050fb8 100644 --- a/mayan/apps/ocr/serializers.py +++ b/mayan/apps/ocr/serializers.py @@ -1,7 +1,5 @@ -from __future__ import absolute_import - from rest_framework import serializers -class DocumentOCRSerializer(serializers.Serializer): - document_id = serializers.IntegerField() +class DocumentVersionOCRSerializer(serializers.Serializer): + document_version_id = serializers.IntegerField() diff --git a/mayan/apps/ocr/settings.py b/mayan/apps/ocr/settings.py index ddfec29592..7bde2b13bb 100644 --- a/mayan/apps/ocr/settings.py +++ b/mayan/apps/ocr/settings.py @@ -1,16 +1,16 @@ -"""Configuration options for the ocr app""" +from __future__ import unicode_literals from django.utils.translation import ugettext_lazy as _ from smart_settings.api import register_settings register_settings( - namespace=u'ocr', - module=u'ocr.settings', + namespace='ocr', + module='ocr.settings', settings=[ - {'name': u'TESSERACT_PATH', 'global_name': u'OCR_TESSERACT_PATH', 'default': u'/usr/bin/tesseract', 'exists': True}, - {'name': u'UNPAPER_PATH', 'global_name': u'OCR_UNPAPER_PATH', 'default': u'/usr/bin/unpaper', 'description': _(u'File path to unpaper program.'), 'exists': True}, - {'name': u'PDFTOTEXT_PATH', 'global_name': u'OCR_PDFTOTEXT_PATH', 'default': u'/usr/bin/pdftotext', 'description': _(u'File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, - {'name': u'BACKEND', 'global_name': u'OCR_BACKEND', 'default': u'ocr.backends.tesseract.Tesseract', 'description': _(u'Full path to the backend to be used to do OCR.')}, + {'name': 'TESSERACT_PATH', 'global_name': 'OCR_TESSERACT_PATH', 'default': '/usr/bin/tesseract', 'exists': True}, + {'name': 'UNPAPER_PATH', 'global_name': 'OCR_UNPAPER_PATH', 'default': '/usr/bin/unpaper', 'description': _('File path to unpaper program.'), 'exists': True}, + {'name': 'PDFTOTEXT_PATH', 'global_name': 'OCR_PDFTOTEXT_PATH', 'default': '/usr/bin/pdftotext', 'description': _('File path to poppler\'s pdftotext program used to extract text from PDF files.'), 'exists': True}, + {'name': 'BACKEND', 'global_name': 'OCR_BACKEND', 'default': 'ocr.backends.tesseract.Tesseract', 'description': _('Full path to the backend to be used to do OCR.')}, ] ) diff --git a/mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py b/mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py new file mode 100644 index 0000000000..ff4777e7ca --- /dev/null +++ b/mayan/apps/ocr/south_migrations/0004_auto__del_documentqueue__del_queuedocument__add_documentversionocrerro.py @@ -0,0 +1,88 @@ +# -*- coding: utf-8 -*- +from south.utils import datetime_utils as datetime +from south.db import db +from south.v2 import SchemaMigration +from django.db import models + + +class Migration(SchemaMigration): + + def forwards(self, orm): + # Deleting model 'DocumentQueue' + db.delete_table(u'ocr_documentqueue') + + # Deleting model 'QueueDocument' + db.delete_table(u'ocr_queuedocument') + + # Adding model 'DocumentVersionOCRError' + db.create_table(u'ocr_documentversionocrerror', ( + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + ('document_version', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.DocumentVersion'])), + ('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, db_index=True, blank=True)), + ('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)), + )) + db.send_create_signal(u'ocr', ['DocumentVersionOCRError']) + + + def backwards(self, orm): + # Adding model 'DocumentQueue' + db.create_table(u'ocr_documentqueue', ( + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + ('name', self.gf('django.db.models.fields.CharField')(max_length=64, unique=True)), + ('label', self.gf('django.db.models.fields.CharField')(max_length=64)), + )) + db.send_create_signal(u'ocr', ['DocumentQueue']) + + # Adding model 'QueueDocument' + db.create_table(u'ocr_queuedocument', ( + ('node_name', self.gf('django.db.models.fields.CharField')(max_length=256, null=True, blank=True)), + ('result', self.gf('django.db.models.fields.TextField')(null=True, blank=True)), + ('datetime_submitted', self.gf('django.db.models.fields.DateTimeField')(auto_now=True, blank=True, db_index=True)), + ('document_queue', self.gf('django.db.models.fields.related.ForeignKey')(related_name='documents', to=orm['ocr.DocumentQueue'])), + ('document', self.gf('django.db.models.fields.related.ForeignKey')(to=orm['documents.Document'])), + (u'id', self.gf('django.db.models.fields.AutoField')(primary_key=True)), + )) + db.send_create_signal(u'ocr', ['QueueDocument']) + + # Deleting model 'DocumentVersionOCRError' + db.delete_table(u'ocr_documentversionocrerror') + + + models = { + u'documents.document': { + 'Meta': {'ordering': "['-date_added']", 'object_name': 'Document'}, + 'date_added': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}), + 'description': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'document_type': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'documents'", 'to': u"orm['documents.DocumentType']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'label': ('django.db.models.fields.CharField', [], {'default': "u'Uninitialized document'", 'max_length': '255', 'db_index': 'True'}), + 'language': ('django.db.models.fields.CharField', [], {'default': "u'eng'", 'max_length': '8'}), + 'uuid': ('django.db.models.fields.CharField', [], {'default': "u'b5b498b5-ffe5-4b70-b8a6-6c875ed11bf2'", 'max_length': '48'}) + }, + u'documents.documenttype': { + 'Meta': {'ordering': "['name']", 'object_name': 'DocumentType'}, + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'name': ('django.db.models.fields.CharField', [], {'unique': 'True', 'max_length': '32'}), + 'ocr': ('django.db.models.fields.BooleanField', [], {'default': 'True'}) + }, + u'documents.documentversion': { + 'Meta': {'object_name': 'DocumentVersion'}, + 'checksum': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}), + 'comment': ('django.db.models.fields.TextField', [], {'blank': 'True'}), + 'document': ('django.db.models.fields.related.ForeignKey', [], {'related_name': "'versions'", 'to': u"orm['documents.Document']"}), + 'encoding': ('django.db.models.fields.CharField', [], {'max_length': '64', 'null': 'True', 'blank': 'True'}), + 'file': ('django.db.models.fields.files.FileField', [], {'max_length': '100'}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'mimetype': ('django.db.models.fields.CharField', [], {'max_length': '255', 'null': 'True', 'blank': 'True'}), + 'timestamp': ('django.db.models.fields.DateTimeField', [], {'auto_now_add': 'True', 'blank': 'True'}) + }, + u'ocr.documentversionocrerror': { + 'Meta': {'ordering': "('datetime_submitted',)", 'object_name': 'DocumentVersionOCRError'}, + 'datetime_submitted': ('django.db.models.fields.DateTimeField', [], {'auto_now': 'True', 'db_index': 'True', 'blank': 'True'}), + 'document_version': ('django.db.models.fields.related.ForeignKey', [], {'to': u"orm['documents.DocumentVersion']"}), + u'id': ('django.db.models.fields.AutoField', [], {'primary_key': 'True'}), + 'result': ('django.db.models.fields.TextField', [], {'null': 'True', 'blank': 'True'}) + } + } + + complete_apps = ['ocr'] \ No newline at end of file diff --git a/mayan/apps/ocr/tasks.py b/mayan/apps/ocr/tasks.py index 0f506e103f..a7bebddc68 100644 --- a/mayan/apps/ocr/tasks.py +++ b/mayan/apps/ocr/tasks.py @@ -1,65 +1,61 @@ -from __future__ import absolute_import +from __future__ import unicode_literals import logging -import platform import sys import traceback from django.conf import settings -from documents.models import Document +from documents.models import DocumentVersion from lock_manager import Lock, LockError from mayan.celery import app from .api import do_document_ocr from .literals import LOCK_EXPIRE -from .models import DocumentQueue, QueueDocument +from .models import DocumentVersionOCRError logger = logging.getLogger(__name__) @app.task(ignore_result=True) -def task_do_ocr(document_pk): - lock_id = u'task_do_ocr_doc-%d' % document_pk +def task_do_ocr(document_version_pk): + lock_id = 'task_do_ocr_doc_version-%d' % document_version_pk try: logger.debug('trying to acquire lock: %s', lock_id) - # Acquire lock to avoid doing OCR on the same document more than once - # concurrently + # Acquire lock to avoid doing OCR on the same document version more than + # once concurrently lock = Lock.acquire_lock(lock_id, LOCK_EXPIRE) logger.debug('acquired lock: %s', lock_id) - document = None + document_version = None try: - logger.info('Starting document OCR for document: %d', document_pk) - document = Document.objects.get(pk=document_pk) - do_document_ocr(document) + logger.info('Starting document OCR for document version: %d', document_version_pk) + document_version = DocumentVersion.objects.get(pk=document_version_pk) + do_document_ocr(document_version) except Exception as exception: - logger.error('OCR error for document: %d; %s', document_pk, exception) - document_queue = DocumentQueue.objects.get(name='default') - if document: - queue_document, created = document_queue.documents.get_or_create(document=document) - queue_document.node_name = platform.node() + logger.error('OCR error for document version: %d; %s', document_version_pk, exception) + if document_version: + entry, created = DocumentVersionOCRError.objects.get_or_create(document_version=document_version) if settings.DEBUG: result = [] type, value, tb = sys.exc_info() result.append('%s: %s' % (type.__name__, value)) result.extend(traceback.format_tb(tb)) - queue_document.result = '\n'.join(result) + entry.result = '\n'.join(result) else: - queue_document.result = exception + entry.result = exception - queue_document.save() + entry.save() else: - logger.info('OCR for document: %d ended', document_pk) - document_queue = DocumentQueue.objects.get(name='default') + logger.info('OCR for document: %d ended', document_version_pk) try: - queue_document = document_queue.documents.get(document=document) - except QueueDocument.DoesNotExist: + entry = DocumentVersionOCRError.objects.get(document_version=document_version) + except DocumentVersionOCRError.DoesNotExist: pass else: - queue_document.delete() + entry.delete() finally: lock.release() except LockError: - logger.debug('unable to obtain lock') + logger.debug('unable to obtain lock: %s' % lock_id) pass diff --git a/mayan/apps/ocr/tests.py b/mayan/apps/ocr/tests.py index a6f48f4c4a..efa0438838 100644 --- a/mayan/apps/ocr/tests.py +++ b/mayan/apps/ocr/tests.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import unicode_literals from django.core.files.base import File from django.test import TransactionTestCase @@ -6,8 +6,6 @@ from django.test import TransactionTestCase from documents.models import Document, DocumentType from documents.tests import TEST_SMALL_DOCUMENT_PATH, TEST_DOCUMENT_TYPE -from .models import DocumentQueue, QueueDocument - class DocumentOCRTestCase(TransactionTestCase): def setUp(self): @@ -16,11 +14,6 @@ class DocumentOCRTestCase(TransactionTestCase): with open(TEST_SMALL_DOCUMENT_PATH) as file_object: self.document = Document.objects.new_document(file_object=File(file_object), document_type=self.document_type)[0].document - DocumentQueue.objects.get_or_create(name='default') - - # Clear OCR queue - QueueDocument.objects.all().delete() - def _test_ocr_language_issue_16(self, language, result): """ Reusable OCR test for a specific language diff --git a/mayan/apps/ocr/urls.py b/mayan/apps/ocr/urls.py index 4b4f5b8436..629f9e9888 100644 --- a/mayan/apps/ocr/urls.py +++ b/mayan/apps/ocr/urls.py @@ -1,19 +1,21 @@ +from __future__ import unicode_literals + from django.conf.urls import patterns, url -from .api_views import DocumentOCRView +from .api_views import DocumentVersionOCRView urlpatterns = patterns('ocr.views', - url(r'^document/(?P\d+)/submit/$', 'submit_document', (), 'submit_document'), - url(r'^document/multiple/submit/$', 'submit_document_multiple', (), 'submit_document_multiple'), - url(r'^queue/document/list/$', 'queue_document_list', (), 'queue_document_list'), - url(r'^queue/document/(?P\d+)/delete/$', 'queue_document_delete', (), 'queue_document_delete'), - url(r'^queue/document/multiple/delete/$', 'queue_document_multiple_delete', (), 'queue_document_multiple_delete'), - url(r'^queue/document/(?P\d+)/re-queue/$', 're_queue_document', (), 're_queue_document'), - url(r'^queue/document/multiple/re-queue/$', 're_queue_multiple_document', (), 're_queue_multiple_document'), + url(r'^document/(?P\d+)/submit/$', 'document_submit', (), 'document_submit'), + url(r'^document/multiple/submit/$', 'document_submit_multiple', (), 'document_submit_multiple'), + url(r'^document/all/clean_up/$', 'document_all_ocr_cleanup', (), 'document_all_ocr_cleanup'), - url(r'^document/all/clean_up/$', 'all_document_ocr_cleanup', (), 'all_document_ocr_cleanup'), + url(r'^all/$', 'entry_list', (), 'entry_list'), + url(r'^(?P\d+)/delete/$', 'entry_delete', (), 'entry_delete'), + url(r'^multiple/delete/$', 'entry_delete_multiple', (), 'entry_delete_multiple'), + url(r'^(?P\d+)/re-queue/$', 'entry_re_queue', (), 'entry_re_queue'), + url(r'^multiple/re-queue/$', 'entry_re_queue_multiple', (), 'entry_re_queue_multiple'), ) api_urls = patterns('', - url(r'^submit/$', DocumentOCRView.as_view(), name='document-ocr-submit-view'), + url(r'^submit/$', DocumentVersionOCRView.as_view(), name='document-version-ocr-submit-view'), ) diff --git a/mayan/apps/ocr/views.py b/mayan/apps/ocr/views.py index 416fd9801e..ae4b80c1a3 100644 --- a/mayan/apps/ocr/views.py +++ b/mayan/apps/ocr/views.py @@ -1,4 +1,4 @@ -from __future__ import absolute_import +from __future__ import absolute_import, unicode_literals from django.contrib import messages from django.core.exceptions import PermissionDenied @@ -6,173 +6,43 @@ from django.core.urlresolvers import reverse from django.http import HttpResponseRedirect from django.shortcuts import get_object_or_404, render_to_response from django.template import RequestContext -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext_lazy as _, ungettext from acls.models import AccessEntry -from common.utils import encapsulate -from documents.models import Document -from documents.widgets import document_link, document_thumbnail +from documents.models import Document, DocumentVersion from permissions.models import Permission from .api import clean_pages -from .models import DocumentQueue, QueueDocument +from .models import DocumentVersionOCRError from .permissions import (PERMISSION_OCR_CLEAN_ALL_PAGES, PERMISSION_OCR_DOCUMENT, PERMISSION_OCR_DOCUMENT_DELETE) -def queue_document_list(request, queue_name='default'): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) - - document_queue = get_object_or_404(DocumentQueue, name=queue_name) - - context = { - 'object_list': document_queue.documents.all(), - 'title': _(u'Documents in queue: %s') % document_queue, - 'hide_object': True, - 'queue': document_queue, - 'navigation_object_name': 'queue', - 'list_object_variable_name': 'queue_document', - 'extra_columns': [ - {'name': _('Document'), 'attribute': encapsulate(lambda x: document_link(x.document) if hasattr(x, 'document') else _(u'Missing document.'))}, - {'name': _(u'Thumbnail'), 'attribute': encapsulate(lambda x: document_thumbnail(x.document))}, - {'name': _('Added'), 'attribute': encapsulate(lambda x: unicode(x.datetime_submitted).split('.')[0]), 'keep_together':True}, - {'name': _('Node'), 'attribute': 'node_name'}, - {'name': _('Result'), 'attribute': 'result'}, - ], - } - - return render_to_response('main/generic_list.html', context, - context_instance=RequestContext(request)) - - -def queue_document_delete(request, queue_document_id=None, queue_document_id_list=None): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE]) - - if queue_document_id: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)] - elif queue_document_id_list: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id) for queue_document_id in queue_document_id_list.split(',')] - else: - messages.error(request, _(u'Must provide at least one queue document.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) - - if request.method == 'POST': - for queue_document in queue_documents: - try: - queue_document.delete() - messages.success(request, _(u'Queue document: %(document)s deleted successfully.') % { - 'document': queue_document.document}) - - except Exception as exception: - messages.error(request, _(u'Error deleting document: %(document)s; %(error)s') % { - 'document': queue_document, 'error': exception}) - return HttpResponseRedirect(next) - - context = { - 'next': next, - 'previous': previous, - 'delete_view': True, - } - - if len(queue_documents) == 1: - context['object'] = queue_documents[0] - context['title'] = _(u'Are you sure you wish to delete queue document: %s?') % ', '.join([unicode(d) for d in queue_documents]) - elif len(queue_documents) > 1: - context['title'] = _(u'Are you sure you wish to delete queue documents: %s?') % ', '.join([unicode(d) for d in queue_documents]) - - return render_to_response('main/generic_confirm.html', context, - context_instance=RequestContext(request)) - - -def queue_document_multiple_delete(request): - return queue_document_delete(request, queue_document_id_list=request.GET.get('id_list', '')) - - -def submit_document_multiple(request): - for item_id in request.GET.get('id_list', '').split(','): - submit_document(request, item_id) - - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - - -def submit_document(request, document_id): - document = get_object_or_404(Document, pk=document_id) +def document_submit(request, pk): + document = get_object_or_404(Document, pk=pk) try: Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) except PermissionDenied: AccessEntry.objects.check_access(PERMISSION_OCR_DOCUMENT, request.user, document) - return submit_document_to_queue(request, document=document, - post_submit_redirect=request.META.get('HTTP_REFERER', reverse('main:home'))) - - -def submit_document_to_queue(request, document, post_submit_redirect=None): - """ - This view is meant to be reusable - """ - document.submit_for_ocr() - messages.success(request, _(u'Document: %(document)s was added to the OCR queue.') % { + messages.success(request, _('Document: %(document)s was added to the OCR queue.') % { 'document': document} ) - if post_submit_redirect: - return HttpResponseRedirect(post_submit_redirect) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) -def re_queue_document(request, queue_document_id=None, queue_document_id_list=None): - Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) +def document_submit_multiple(request): + for item_id in request.GET.get('id_list', '').split(','): + document_submit(request, item_id) - if queue_document_id: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id)] - elif queue_document_id_list: - queue_documents = [get_object_or_404(QueueDocument, pk=queue_document_id) for queue_document_id in queue_document_id_list.split(',')] - else: - messages.error(request, _(u'Must provide at least one queue document.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - - next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) - previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) - - if request.method == 'POST': - for queue_document in queue_documents: - try: - queue_document.document.submit_for_ocr() - messages.success( - request, - _(u'Document: %(document)s was re-queued for OCR.') % { - 'document': queue_document.document - } - ) - except Document.DoesNotExist: - messages.error(request, _(u'Document id#: %d, no longer exists.') % queue_document.document_id) - return HttpResponseRedirect(next) - - context = { - 'next': next, - 'previous': previous, - } - - if len(queue_documents) == 1: - context['object'] = queue_documents[0] - context['title'] = _(u'Are you sure you wish to re-queue document: %s?') % ', '.join([unicode(d) for d in queue_documents]) - elif len(queue_documents) > 1: - context['title'] = _(u'Are you sure you wish to re-queue documents: %s?') % ', '.join([unicode(d) for d in queue_documents]) - - return render_to_response('main/generic_confirm.html', context, - context_instance=RequestContext(request)) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) -def re_queue_multiple_document(request): - return re_queue_document(request, queue_document_id_list=request.GET.get('id_list', [])) - - -def all_document_ocr_cleanup(request): +def document_all_ocr_cleanup(request): Permission.objects.check_permissions(request.user, [PERMISSION_OCR_CLEAN_ALL_PAGES]) previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) @@ -182,27 +52,133 @@ def all_document_ocr_cleanup(request): return render_to_response('main/generic_confirm.html', { 'previous': previous, 'next': next, - 'title': _(u'Are you sure you wish to clean up all the pages content?'), - 'message': _(u'On large databases this operation may take some time to execute.'), + 'title': _('Are you sure you wish to clean up all the pages content?'), + 'message': _('On large databases this operation may take some time to execute.'), }, context_instance=RequestContext(request)) else: try: + # TODO: turn this into a Celery task clean_pages() - messages.success(request, _(u'Document pages content clean up complete.')) + messages.success(request, _('Document pages content clean up complete.')) except Exception as exception: - messages.error(request, _(u'Document pages content clean up error: %s') % exception) + messages.error(request, _('Document pages content clean up error: %s') % exception) return HttpResponseRedirect(next) -def display_link(obj): - output = [] - if hasattr(obj, 'get_absolute_url'): - output.append(u'%(obj)s' % { - 'url': obj.get_absolute_url(), - 'obj': obj - }) - if output: - return u''.join(output) +def entry_list(request): + Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) + + context = { + 'object_list': DocumentVersionOCRError.objects.all(), + 'title': _('OCR errors'), + 'hide_object': True, + } + + return render_to_response('main/generic_list.html', context, + context_instance=RequestContext(request)) + + +def entry_delete(request, pk=None, pk_list=None): + Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT_DELETE]) + + if pk: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk)] + elif pk_list: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk) for pk in pk_list.split(',')] else: - return obj + messages.error(request, _('Make at least one selection.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) + + if request.method == 'POST': + for entry in entries: + try: + entry.delete() + messages.success(request, _('Entry: %(entry)s deleted successfully.') % { + 'entry': entry}) + + except Exception as exception: + messages.error(request, _('Error entry: %(entry)s; %(error)s') % { + 'entry': entry, 'error': exception}) + return HttpResponseRedirect(next) + + context = { + 'next': next, + 'previous': previous, + 'delete_view': True, + } + + if len(entries) == 1: + context['object'] = entries[0] + + context['title'] = ungettext( + 'Are you sure you wish to delete the entry: %(entry)s?', + 'Are you sure you wish to delete these %(count)d entries.', + len(entries) + ) % { + 'count': len(entries), + 'entry': entries[0], + } + + return render_to_response('main/generic_confirm.html', context, + context_instance=RequestContext(request)) + + +def entry_delete_multiple(request): + return entry_delete(request, pk_list=request.GET.get('id_list', '')) + + +def entry_re_queue(request, pk=None, pk_list=None): + Permission.objects.check_permissions(request.user, [PERMISSION_OCR_DOCUMENT]) + + if pk: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk)] + elif pk_list: + entries = [get_object_or_404(DocumentVersionOCRError, pk=pk) for pk in pk_list.split(',')] + else: + messages.error(request, _('Make at least one selection.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + next = request.POST.get('next', request.GET.get('next', request.META.get('HTTP_REFERER', None))) + previous = request.POST.get('previous', request.GET.get('previous', request.META.get('HTTP_REFERER', None))) + + if request.method == 'POST': + for entry in entries: + try: + entry.document_version.submit_for_ocr() + messages.success( + request, + _('Entry: %(entry)s was re-queued for OCR.') % { + 'entry': entry + } + ) + except DocumentVersion.DoesNotExist: + messages.error(request, _('Document version id#: %d, no longer exists.') % entry.document_version_id) + return HttpResponseRedirect(next) + + context = { + 'next': next, + 'previous': previous, + } + + if len(entries) == 1: + context['object'] = entries[0] + + context['title'] = ungettext( + 'Are you sure you wish to re-queue the entry: %(entry)s?', + 'Are you sure you wish to re-queue these %(count)d entries.', + len(entries) + ) % { + 'count': len(entries), + 'entry': entries[0], + } + + return render_to_response('main/generic_confirm.html', context, + context_instance=RequestContext(request)) + + +def entry_re_queue_multiple(request): + return entry_re_queue(request, pk_list=request.GET.get('id_list', [])) From aca7c94131340ce4bf2e31c7d73d245ac94f0694 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 03:05:22 -0400 Subject: [PATCH 11/20] Update Vagrant instructions to include -B option to workers to launch Celery Beat --- contrib/misc/mayan_edms_worker.sh | 2 +- docs/topics/development.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/misc/mayan_edms_worker.sh b/contrib/misc/mayan_edms_worker.sh index 3617829e5c..43b30c6343 100755 --- a/contrib/misc/mayan_edms_worker.sh +++ b/contrib/misc/mayan_edms_worker.sh @@ -1,3 +1,3 @@ #!/bin/sh -DJANGO_SETTINGS_MODULE='mayan.settings.celery_redis' celery -A mayan worker -l DEBUG -Q checkouts,mailing,uploads,converter,ocr,tools,indexing,metadata -Ofair +DJANGO_SETTINGS_MODULE='mayan.settings.celery_redis' celery -A mayan worker -l DEBUG -Q checkouts,mailing,uploads,converter,ocr,tools,indexing,metadata -Ofair -B diff --git a/docs/topics/development.rst b/docs/topics/development.rst index fc35f6834d..5744691204 100644 --- a/docs/topics/development.rst +++ b/docs/topics/development.rst @@ -99,7 +99,7 @@ Then on a separate console launch a celery worker from the same provisioned Vagr $ vagrant ssh vagrant@vagrant-ubuntu-trusty-32:~$ cd ~/mayan-edms/ vagrant@vagrant-ubuntu-trusty-32:~$ source venv/bin/activate - vagrant@vagrant-ubuntu-trusty-32:~$ DJANGO_SETTINGS_MODULE='mayan.settings.celery_redis' celery -A mayan worker -l DEBUG -Q checkouts,mailing,uploads,converter,ocr,tools,indexing,metadata -Ofair + vagrant@vagrant-ubuntu-trusty-32:~$ DJANGO_SETTINGS_MODULE='mayan.settings.celery_redis' celery -A mayan worker -l DEBUG -Q checkouts,mailing,uploads,converter,ocr,tools,indexing,metadata -Ofair -B Contributing changes From a2b201127754ede6f47a47d7f8ff4c9db3822f2f Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 03:40:08 -0400 Subject: [PATCH 12/20] More migration dependencies check --- ...hfoldersource__chg_field_intervalbasemodel_document_type.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py b/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py index becfb1aced..30da4f66de 100644 --- a/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py +++ b/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py @@ -14,6 +14,9 @@ def fake_get_or_create(model, *args, **kwargs): class Migration(SchemaMigration): + needed_by = ( + ('documents', '0031_remove_orphan_documents'), + ) def forwards(self, orm): # Deleting model 'WatchFolderSource' From 172ef1e79a6653c02f7a02b3eccc4cf50452a50f Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 03:44:16 -0400 Subject: [PATCH 13/20] Make sure the ocr field of document type exists before we try to put data into it --- ...foldersource__chg_field_intervalbasemodel_document_type.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py b/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py index 30da4f66de..32d5d923d6 100644 --- a/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py +++ b/mayan/apps/sources/south_migrations/0009_auto__del_watchfoldersource__chg_field_intervalbasemodel_document_type.py @@ -14,6 +14,10 @@ def fake_get_or_create(model, *args, **kwargs): class Migration(SchemaMigration): + depends_on = ( + ('documents', '0024_auto__add_field_documenttype_ocr'), + ) + needed_by = ( ('documents', '0031_remove_orphan_documents'), ) From 007ac0f9780b31cbb231bbbdbd18196f62fc337b Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 15 Jan 2015 03:48:59 -0400 Subject: [PATCH 14/20] Add libreoffice to the Vagrant provissioning script --- contrib/scripts/install/development.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/scripts/install/development.sh b/contrib/scripts/install/development.sh index 01d71108af..403f3bbbeb 100644 --- a/contrib/scripts/install/development.sh +++ b/contrib/scripts/install/development.sh @@ -1,6 +1,6 @@ sudo apt-get update sudo apt-get -y upgrade -sudo apt-get -y install git-core python-virtualenv gcc python-dev libjpeg-dev libpng-dev libtiff-dev tesseract-ocr poppler-utils unpaper redis-server +sudo apt-get -y install git-core python-virtualenv gcc python-dev libjpeg-dev libpng-dev libtiff-dev tesseract-ocr poppler-utils unpaper redis-server libreoffice git clone /mayan-edms-repository/ /home/vagrant/mayan-edms cd /home/vagrant/mayan-edms git checkout development From bcc85d57147237bd97610a813dec053cdb06d105 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 16 Jan 2015 00:56:24 -0400 Subject: [PATCH 15/20] Remove unused mimetype views module --- mayan/apps/mimetype/views.py | 1 - 1 file changed, 1 deletion(-) delete mode 100644 mayan/apps/mimetype/views.py diff --git a/mayan/apps/mimetype/views.py b/mayan/apps/mimetype/views.py deleted file mode 100644 index 60f00ef0ef..0000000000 --- a/mayan/apps/mimetype/views.py +++ /dev/null @@ -1 +0,0 @@ -# Create your views here. From d7546c8fb710215a113f85fb62351b8279049b3d Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 16 Jan 2015 01:06:05 -0400 Subject: [PATCH 16/20] Remove unused code --- mayan/apps/mimetype/api.py | 79 -------------------------------------- 1 file changed, 79 deletions(-) diff --git a/mayan/apps/mimetype/api.py b/mayan/apps/mimetype/api.py index f594247602..a0ba8c6d2d 100644 --- a/mayan/apps/mimetype/api.py +++ b/mayan/apps/mimetype/api.py @@ -1,7 +1,5 @@ import os -from django.conf import settings - try: import magic USE_PYTHON_MAGIC = True @@ -11,83 +9,6 @@ except: USE_PYTHON_MAGIC = False -MIMETYPE_ICONS_DIRECTORY_NAME = os.path.join('images', 'mimetypes') - -UNKNWON_TYPE_FILE_NAME = 'unknown.png' -ERROR_FILE_NAME = 'error.png' - -mimetype_icons = { - 'application/pdf': 'file_extension_pdf.png', - 'application/zip': 'file_extension_zip.png', - 'application/ogg': 'file_extension_ogg.png', - 'application/postscript': 'file_extension_ps.png', - 'application/x-gzip': 'file_extension_gz.png', - 'application/x-rar-compressed': 'file_extension_rar.png', - 'application/x-troff-msvideo': 'file_extension_avi.png', - 'application/acad': 'file_extension_dwg.png', - 'application/octet-stream': 'file_extension_exe.png', - 'application/vnd.oasis.opendocument.text': 'ODF_textdocument_32x32.png', - 'application/vnd.oasis.opendocument.spreadsheet': 'ODF_spreadsheet_32x32.png', - 'application/vnd.oasis.opendocument.presentation': 'ODF_presentation_32x32.png', - 'application/vnd.oasis.opendocument.graphics': 'ODF_drawing_32x32.png', - 'application/vnd.ms-excel': 'file_extension_xls.png', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': 'file_extension_xls.png', - 'application/msword': 'file_extension_doc.png', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'file_extension_doc.png', - 'application/mspowerpoint': 'file_extension_pps.png', - 'application/vnd.ms-powerpoint': 'file_extension_pps.png', - 'application/wav': 'file_extension_wav.png', - 'application/x-wav': 'file_extension_wav.png', - 'application/vnd.oasis.opendocument.text': 'ODF_textdocument_32x32.png', - - 'image/jpeg': 'file_extension_jpeg.png', - 'image/png': 'file_extension_png.png', - 'image/x-png': 'file_extension_png.png', - 'image/tiff': 'file_extension_tif.png', - 'image/x-tiff': 'file_extension_tif.png', - 'image/bmp': 'file_extension_bmp.png', - 'image/gif': 'file_extension_gif.png', - 'image/vnd.dwg': 'file_extension_dwg.png', - 'image/x-dwg': 'file_extension_dwg.png', - - 'audio/mpeg': 'file_extension_mp3.png', - 'audio/mid': 'file_extension_mid.png', - 'audio/x-wav': 'file_extension_wav.png', - 'audio/vnd.wav': 'file_extension_wav.png', - 'audio/x-pn-realaudio': 'file_extension_ram.png', - 'audio/mp4': 'file_extension_mp4.png', - 'audio/x-ms-wma': 'file_extension_wma.png', - - 'video/avi': 'file_extension_avi.png', - 'video/mpeg': 'file_extension_mpeg.png', - 'video/quicktime': 'file_extension_mov.png', - 'video/x-ms-asf': 'file_extension_asf.png', - 'video/x-ms-wmv': 'file_extension_wmv.png', - - 'text/html': 'file_extension_html.png', - 'text/plain': 'file_extension_txt.png', -} - - -def get_icon_file_path(mimetype): - file_name = mimetype_icons.get(mimetype, UNKNWON_TYPE_FILE_NAME) - if settings.DEBUG: - return os.path.join(settings.BASE_DIR, 'apps', 'mimetype', 'static', MIMETYPE_ICONS_DIRECTORY_NAME, file_name) - else: - return os.path.join(settings.STATIC_ROOT, MIMETYPE_ICONS_DIRECTORY_NAME, file_name) - - -def get_error_icon_file_path(): - if settings.DEBUG: - return os.path.join(settings.BASE_DIR, 'apps', 'mimetype', 'static', MIMETYPE_ICONS_DIRECTORY_NAME, ERROR_FILE_NAME) - else: - return os.path.join(settings.STATIC_ROOT, MIMETYPE_ICONS_DIRECTORY_NAME, ERROR_FILE_NAME) - - -def get_error_icon_url(): - return os.path.join(MIMETYPE_ICONS_DIRECTORY_NAME, ERROR_FILE_NAME) - - def get_mimetype(file_description, filepath, mimetype_only=False): """ Determine a file's mimetype by calling the system's libmagic From 6c854116b54ba1252e48907997dc86c041c5383c Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 16 Jan 2015 02:06:24 -0400 Subject: [PATCH 17/20] Detect when a selected document or documents do not have any metadata, display a proper message and return user to the last previous view. Issue #144 --- mayan/apps/metadata/views.py | 38 ++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/mayan/apps/metadata/views.py b/mayan/apps/metadata/views.py index 4ad1c591be..7ac0722775 100644 --- a/mayan/apps/metadata/views.py +++ b/mayan/apps/metadata/views.py @@ -4,11 +4,12 @@ from django.conf import settings from django.contrib import messages from django.core.exceptions import PermissionDenied from django.core.urlresolvers import reverse -from django.http import HttpResponseRedirect -from django.shortcuts import get_object_or_404, render_to_response +from django.http import HttpResponseRedirect, Http404 +from django.shortcuts import (get_list_or_404, get_object_or_404, + render_to_response) from django.template import RequestContext from django.utils.http import urlencode -from django.utils.translation import ugettext_lazy as _ +from django.utils.translation import ugettext_lazy as _, ungettext from acls.models import AccessEntry from documents.models import Document, DocumentType @@ -36,15 +37,9 @@ from .permissions import (PERMISSION_METADATA_DOCUMENT_ADD, def metadata_edit(request, document_id=None, document_id_list=None): if document_id: - documents = [get_object_or_404(Document, pk=document_id)] - if documents[0].metadata.count() == 0: - messages.warning(request, _(u'The selected document doesn\'t have any metadata.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) - elif document_id_list: - documents = [get_object_or_404(Document.objects.select_related('document_type'), pk=document_id) for document_id in document_id_list.split(',')] - if len(set([document.document_type.pk for document in documents])) > 1: - messages.error(request, _(u'Only select documents of the same type.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + document_id_list = unicode(document_id) + + documents = Document.objects.select_related('metadata').filter(pk__in=document_id_list.split(',')) try: Permission.objects.check_permissions(request.user, [PERMISSION_METADATA_DOCUMENT_EDIT]) @@ -52,7 +47,23 @@ def metadata_edit(request, document_id=None, document_id_list=None): documents = AccessEntry.objects.filter_objects_by_access(PERMISSION_METADATA_DOCUMENT_EDIT, request.user, documents) if not documents: - messages.error(request, _(u'Must provide at least one document.')) + if document_id: + raise Http404 + else: + messages.error(request, _(u'Must provide at least one document.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + if len(set([document.document_type.pk for document in documents])) > 1: + messages.error(request, _(u'Only select documents of the same type.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + if set(documents.values_list('metadata__value' ,flat=True)) == set([None]): + message = ungettext( + u'The selected document doesn\'t have any metadata.', + u'The selected documents doesn\'t have any metadata.', + len(documents) + ) + messages.warning(request, message) return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) post_action_redirect = reverse('documents:document_list_recent') @@ -111,6 +122,7 @@ def metadata_edit(request, document_id=None, document_id_list=None): 'form': formset, 'next': next, } + if len(documents) == 1: context['object'] = documents[0] context['title'] = _(u'Edit metadata for document: %s') % ', '.join([unicode(d) for d in documents]) From 72a0c2dd95b9b5447770fa67850e6db0e2b2c861 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 16 Jan 2015 02:08:15 -0400 Subject: [PATCH 18/20] Properly pluralize document metadata edit view --- mayan/apps/metadata/views.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/mayan/apps/metadata/views.py b/mayan/apps/metadata/views.py index 7ac0722775..bbccad0e07 100644 --- a/mayan/apps/metadata/views.py +++ b/mayan/apps/metadata/views.py @@ -125,9 +125,15 @@ def metadata_edit(request, document_id=None, document_id_list=None): if len(documents) == 1: context['object'] = documents[0] - context['title'] = _(u'Edit metadata for document: %s') % ', '.join([unicode(d) for d in documents]) - elif len(documents) > 1: - context['title'] = _(u'Edit metadata for documents: %s') % ', '.join([unicode(d) for d in documents]) + + context['title'] = ungettext( + u'Edit metadata for document: %(document)s', + u'Edit metadata for the %(count)d selected documents', + len(documents) + ) % { + u'count': len(documents), + u'document': documents[0], + } return render_to_response('main/generic_form.html', context, context_instance=RequestContext(request)) From 95dd017b23b9f5bad302451800b0676d2257d308 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 16 Jan 2015 02:12:26 -0400 Subject: [PATCH 19/20] Extend issue #144 solution to the document metadata remove view too. --- mayan/apps/metadata/views.py | 46 ++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/mayan/apps/metadata/views.py b/mayan/apps/metadata/views.py index bbccad0e07..4d0d280452 100644 --- a/mayan/apps/metadata/views.py +++ b/mayan/apps/metadata/views.py @@ -223,24 +223,33 @@ def metadata_multiple_add(request): def metadata_remove(request, document_id=None, document_id_list=None): if document_id: - documents = [get_object_or_404(Document, pk=document_id)] - if documents[0].metadata.count() == 0: - messages.warning(request, _(u'The selected document doesn\'t have any metadata.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + document_id_list = unicode(document_id) - elif document_id_list: - documents = [get_object_or_404(Document.objects.select_related('document_type'), pk=document_id) for document_id in document_id_list.split(',')] - if len(set([document.document_type.pk for document in documents])) > 1: - messages.error(request, _(u'Only select documents of the same type.')) - return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + documents = Document.objects.select_related('metadata').filter(pk__in=document_id_list.split(',')) try: - Permission.objects.check_permissions(request.user, [PERMISSION_METADATA_DOCUMENT_REMOVE]) + Permission.objects.check_permissions(request.user, [PERMISSION_METADATA_DOCUMENT_EDIT]) except PermissionDenied: - documents = AccessEntry.objects.filter_objects_by_access(PERMISSION_METADATA_DOCUMENT_REMOVE, request.user, documents) + documents = AccessEntry.objects.filter_objects_by_access(PERMISSION_METADATA_DOCUMENT_EDIT, request.user, documents) if not documents: - messages.error(request, _(u'Must provide at least one document.')) + if document_id: + raise Http404 + else: + messages.error(request, _(u'Must provide at least one document.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + if len(set([document.document_type.pk for document in documents])) > 1: + messages.error(request, _(u'Only select documents of the same type.')) + return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) + + if set(documents.values_list('metadata__value' ,flat=True)) == set([None]): + message = ungettext( + u'The selected document doesn\'t have any metadata.', + u'The selected documents doesn\'t have any metadata.', + len(documents) + ) + messages.warning(request, message) return HttpResponseRedirect(request.META.get('HTTP_REFERER', reverse('main:home'))) post_action_redirect = reverse('documents:document_list_recent') @@ -291,11 +300,18 @@ def metadata_remove(request, document_id=None, document_id_list=None): 'form': formset, 'next': next, } + if len(documents) == 1: context['object'] = documents[0] - context['title'] = _(u'Remove metadata types from document: %s') % ', '.join([unicode(d) for d in documents]) - elif len(documents) > 1: - context['title'] = _(u'Remove metadata types from documents: %s') % ', '.join([unicode(d) for d in documents]) + + context['title'] = ungettext( + u'Remove metadata types from document: %(document)s', + u'Remove metadata types from the %(count)d selected documents', + len(documents) + ) % { + u'count': len(documents), + u'document': documents[0], + } return render_to_response('main/generic_form.html', context, context_instance=RequestContext(request)) From a2bcc38b08962ae0f7ffb61c4524c6888d12c2f4 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Fri, 16 Jan 2015 02:14:35 -0400 Subject: [PATCH 20/20] Properly pluralize the messages of the document metadata type add view --- mayan/apps/metadata/views.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/mayan/apps/metadata/views.py b/mayan/apps/metadata/views.py index 4d0d280452..9c8444efc0 100644 --- a/mayan/apps/metadata/views.py +++ b/mayan/apps/metadata/views.py @@ -207,11 +207,18 @@ def metadata_add(request, document_id=None, document_id_list=None): 'form': form, 'next': next, } + if len(documents) == 1: context['object'] = documents[0] - context['title'] = _(u'Add metadata type to document: %s') % ', '.join([unicode(d) for d in documents]) - elif len(documents) > 1: - context['title'] = _(u'Add metadata type to documents: %s') % ', '.join([unicode(d) for d in documents]) + + context['title'] = ungettext( + u'Add metadata types to document: %(document)s', + u'Add metadata types to the %(count)d selected documents', + len(documents) + ) % { + u'count': len(documents), + u'document': documents[0], + } return render_to_response('main/generic_form.html', context, context_instance=RequestContext(request))