@admin.register(DuplicatedDocument)
class DuplicatedDocumentAdmin(admin.ModelAdmin):
    """
    Admin change list for duplicate records, showing the source document
    and the time the record was added.
    """
    list_display = ('document', 'datetime_added')
StatisticNamespace, CharJSLine -from .handlers import create_default_document_type +from .handlers import ( + create_default_document_type, handler_scan_duplicates_for +) from .links import ( link_clear_image_cache, link_document_clear_transformations, link_document_clone_transformations, link_document_delete, - link_document_document_type_edit, + link_document_document_type_edit, link_document_duplicates_list, link_document_multiple_document_type_edit, link_document_download, link_document_edit, link_document_list, link_document_list_deleted, link_document_list_recent, link_document_multiple_delete, @@ -55,7 +57,8 @@ from .links import ( link_document_type_filename_list, link_document_type_list, link_document_type_setup, link_document_update_page_count, link_document_version_download, link_document_version_list, - link_document_version_revert, link_trash_can_empty + link_document_version_revert, link_duplicated_document_list, + link_duplicated_document_scan, link_trash_can_empty ) from .literals import ( CHECK_DELETE_PERIOD_INTERVAL, CHECK_TRASH_PERIOD_INTERVAL, @@ -73,6 +76,7 @@ from .permissions import ( from .queues import * # NOQA # Just import to initialize the search models from .search import document_search, document_page_search # NOQA +from .signals import post_version_upload from .statistics import ( new_documents_per_month, new_document_pages_per_month, new_document_pages_this_month, new_documents_this_month, @@ -100,6 +104,7 @@ class DocumentsApp(MayanAppConfig): DocumentType = self.get_model('DocumentType') DocumentTypeFilename = self.get_model('DocumentTypeFilename') DocumentVersion = self.get_model('DocumentVersion') + DuplicatedDocument = self.get_model('DuplicatedDocument') DynamicSerializerField.add_serializer( klass=Document, @@ -271,6 +276,16 @@ class DocumentsApp(MayanAppConfig): source=DocumentVersion, label=_('Comment'), attribute='comment' ) + SourceColumn( + source=DuplicatedDocument, label=_('Thumbnail'), + func=lambda context: 
document_thumbnail_widget.render( + instance=context['object'].document + ) + ) + SourceColumn( + source=DuplicatedDocument, label=_('Duplicates'), + func=lambda context: context['object'].documents.count() + ) app.conf.CELERYBEAT_SCHEDULE.update( { @@ -328,20 +343,28 @@ class DocumentsApp(MayanAppConfig): 'documents.tasks.task_upload_new_version': { 'queue': 'uploads' }, + 'documents.tasks.task_scan_duplicates_all': { + 'queue': 'tools' + }, + 'documents.tasks.task_scan_duplicates_for': { + 'queue': 'uploads' + }, } ) menu_documents.bind_links( links=( link_document_list_recent, link_document_list, - link_document_list_deleted + link_document_list_deleted, link_duplicated_document_list ) ) menu_main.bind_links(links=(menu_documents,), position=0) menu_setup.bind_links(links=(link_document_type_setup,)) - menu_tools.bind_links(links=(link_clear_image_cache,)) + menu_tools.bind_links( + links=(link_clear_image_cache, link_duplicated_document_scan) + ) # Document type links menu_object.bind_links( @@ -384,7 +407,7 @@ class DocumentsApp(MayanAppConfig): link_document_print, link_document_trash, link_document_download, link_document_clear_transformations, link_document_clone_transformations, - link_document_update_page_count + link_document_update_page_count, ), sources=(Document,) ) menu_object.bind_links( @@ -393,7 +416,10 @@ class DocumentsApp(MayanAppConfig): ) # Document facet links - menu_facet.bind_links(links=(link_acl_list,), sources=(Document,)) + menu_facet.bind_links( + links=(link_document_duplicates_list, link_acl_list,), + sources=(Document,) + ) menu_facet.bind_links( links=(link_document_preview,), sources=(Document,), position=0 ) @@ -499,6 +525,10 @@ class DocumentsApp(MayanAppConfig): create_default_document_type, dispatch_uid='create_default_document_type' ) + post_version_upload.connect( + handler_scan_duplicates_for, + dispatch_uid='handler_scan_duplicates_for', + ) registry.register(DeletedDocument) registry.register(Document) diff --git 
def handler_scan_duplicates_for(sender, instance, **kwargs):
    """
    Signal receiver that queues a duplicate scan for the document
    referenced by the instance carried in the signal.
    """
    # Only the primary key is handed to the task; the task loads the
    # document again on the worker side.
    task_arguments = {'document_id': instance.document.pk}
    task_scan_duplicates_for.apply_async(kwargs=task_arguments)
class DuplicatedDocumentManager(models.Manager):
    """
    Manager that creates and refreshes duplicate records by comparing the
    checksum of the latest version of each document.
    """

    def scan(self):
        """
        Find duplicates by iterating over all documents and matching the
        checksum of their latest versions.
        """
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )

        # Every document is visited by this loop, so the per document scan
        # does not need to descend into the duplicates it finds.
        for document in Document.objects.all():
            self.scan_for(document=document, scan_children=False)

    def scan_for(self, document, scan_children=True):
        """
        Find the duplicates of a single document by matching latest version
        checksums and store the result.

        When duplicates are found, the record for the document is created
        if needed and its list is replaced with the current matches. When
        none are found, any record left over from a previous scan is
        deleted so the document stops being reported as duplicated.

        If scan_children is True each duplicate found is scanned too (one
        level only) so that its own record is refreshed as well.
        """
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )

        # Get the documents whose latest version matches the checksum
        # of the current document and exclude the current document.
        # The result is evaluated once and reused below instead of hitting
        # the database for exists(), add() and the children loop.
        duplicates = list(
            Document.objects.annotate(
                max_timestamp=Max('versions__timestamp')
            ).filter(
                versions__timestamp=F('max_timestamp'),
                versions__checksum=document.checksum
            ).exclude(pk=document.pk)
        )

        if duplicates:
            instance, created = self.get_or_create(document=document)
            # set() instead of add(): documents that matched in an earlier
            # scan but no longer do are removed from the list.
            instance.documents.set(duplicates)
        else:
            # No duplicates anymore, drop the stale record (if any).
            self.filter(document=document).delete()

        if scan_children:
            for duplicate in duplicates:
                self.scan_for(document=duplicate, scan_children=False)
@python_2_unicode_compatible
class DuplicatedDocument(models.Model):
    """
    Record of the documents found to be duplicates of `document`.
    Instances are created and updated through DuplicatedDocumentManager.
    """
    # Document that was scanned. on_delete is stated explicitly: CASCADE is
    # what the migration for this model records, and leaving the argument
    # out is deprecated since Django 1.9.
    # NOTE(review): this is a plain ForeignKey, so nothing at the database
    # level prevents two records for the same document - confirm whether
    # that is intended, callers use .get(document=...).
    document = models.ForeignKey(
        Document, on_delete=models.CASCADE, related_name='duplicates',
        verbose_name=_('Document')
    )
    # Documents detected as duplicates of `document`.
    documents = models.ManyToManyField(
        Document, verbose_name=_('Duplicated documents')
    )
    # Set once, when the record is first created.
    datetime_added = models.DateTimeField(
        auto_now_add=True, db_index=True, verbose_name=_('Added')
    )

    objects = DuplicatedDocumentManager()

    def __str__(self):
        return force_text(self.document)

    class Meta:
        verbose_name = _('Duplicated document')
        verbose_name_plural = _('Duplicated documents')
@app.task(ignore_result=True)
def task_scan_duplicates_all():
    """
    Background task that runs the duplicate scan over every document.
    """
    duplicated_document_manager = apps.get_model(
        app_label='documents', model_name='DuplicatedDocument'
    ).objects

    duplicated_document_manager.scan()


@app.task(ignore_result=True)
def task_scan_duplicates_for(document_id):
    """
    Background task that runs the duplicate scan for one document,
    identified by its primary key.
    """
    # Models are resolved through the app registry at call time.
    DuplicatedDocument = apps.get_model(
        app_label='documents', model_name='DuplicatedDocument'
    )
    Document = apps.get_model(
        app_label='documents', model_name='Document'
    )

    DuplicatedDocument.objects.scan_for(
        document=Document.objects.get(pk=document_id)
    )
self.get('documents:duplicated_document_list') + + def _request_document_duplicates_list(self): + return self.get( + 'documents:document_duplicates_list', args=(self.document.pk,) + ) + + def test_duplicated_document_list_no_permissions(self): + self._upload_duplicate_document() + response = self._request_duplicated_document_list() + + self.assertNotContains( + response, text=self.document.label, status_code=200 + ) + + def test_duplicated_document_list_with_permissions(self): + self._upload_duplicate_document() + self.grant(permission=permission_document_view) + response = self._request_duplicated_document_list() + + self.assertContains( + response, text=self.document.label, status_code=200 + ) + + def test_document_duplicates_list_no_permissions(self): + self._upload_duplicate_document() + response = self._request_document_duplicates_list() + + self.assertEqual(response.status_code, 403) + + def test_document_duplicates_list_with_permissions(self): + self._upload_duplicate_document() + self.grant(permission=permission_document_view) + response = self._request_document_duplicates_list() + + self.assertContains( + response, text=self.document.label, status_code=200 + ) diff --git a/mayan/apps/documents/urls.py b/mayan/apps/documents/urls.py index c81fb12461..e5c2250b3d 100644 --- a/mayan/apps/documents/urls.py +++ b/mayan/apps/documents/urls.py @@ -16,8 +16,8 @@ from .views import ( ClearImageCacheView, DeletedDocumentDeleteView, DeletedDocumentDeleteManyView, DeletedDocumentListView, DocumentDocumentTypeEditView, DocumentDownloadFormView, - DocumentDownloadView, DocumentEditView, DocumentListView, - DocumentPageListView, DocumentPageNavigationFirst, + DocumentDownloadView, DocumentDuplicatesListView, DocumentEditView, + DocumentListView, DocumentPageListView, DocumentPageNavigationFirst, DocumentPageNavigationLast, DocumentPageNavigationNext, DocumentPageNavigationPrevious, DocumentPageRotateLeftView, DocumentPageRotateRightView, DocumentPageView, 
DocumentPageViewResetView, @@ -31,7 +31,8 @@ from .views import ( DocumentTypeListView, DocumentTypeEditView, DocumentUpdatePageCountView, DocumentVersionDownloadFormView, DocumentVersionDownloadView, DocumentVersionListView, DocumentVersionRevertView, DocumentView, - EmptyTrashCanView, RecentDocumentListView + DuplicatedDocumentListView, EmptyTrashCanView, RecentDocumentListView, + ScanDuplicatedDocuments ) @@ -45,7 +46,11 @@ urlpatterns = [ r'^list/deleted/$', DeletedDocumentListView.as_view(), name='document_list_deleted' ), - + url( + r'^list/duplicated/$', + DuplicatedDocumentListView.as_view(), + name='duplicated_document_list' + ), url( r'^(?P\d+)/preview/$', DocumentPreviewView.as_view(), name='document_preview' @@ -54,6 +59,10 @@ urlpatterns = [ r'^(?P\d+)/properties/$', DocumentView.as_view(), name='document_properties' ), + url( + r'^(?P\d+)/duplicates/$', DocumentDuplicatesListView.as_view(), + name='document_duplicates_list' + ), url( r'^(?P\d+)/restore/$', DocumentRestoreView.as_view(), name='document_restore' @@ -255,6 +264,14 @@ urlpatterns = [ DocumentTypeFilenameCreateView.as_view(), name='document_type_filename_create' ), + + # Tools + + url( + r'^tools/documents/duplicated/scan/$', + ScanDuplicatedDocuments.as_view(), + name='duplicated_document_scan' + ), ] api_urls = [ diff --git a/mayan/apps/documents/views/document_views.py b/mayan/apps/documents/views/document_views.py index be3bb84833..c641b1017b 100644 --- a/mayan/apps/documents/views/document_views.py +++ b/mayan/apps/documents/views/document_views.py @@ -19,6 +19,7 @@ from common.generics import ( SingleObjectDownloadView, SingleObjectEditView, SingleObjectListView ) from common.mixins import MultipleInstanceActionMixin +from common.utils import encapsulate from converter.models import Transformation from converter.permissions import ( permission_transformation_delete, permission_transformation_edit @@ -31,7 +32,9 @@ from ..forms import ( DocumentTypeSelectForm, ) from ..literals import 
class DocumentDuplicatesListView(DocumentListView):
    """
    List the documents recorded as duplicates of the document whose
    primary key is given in the URL.
    """

    def dispatch(self, request, *args, **kwargs):
        # Access is checked against the source document before the list
        # is built.
        AccessControlList.objects.check_access(
            permissions=permission_document_view, user=self.request.user,
            obj=self.get_document()
        )

        return super(
            DocumentDuplicatesListView, self
        ).dispatch(request, *args, **kwargs)

    def get_document(self):
        """
        Return the source document, or raise Http404 if it does not exist.

        The instance is cached on the view: this method is called from
        dispatch(), get_queryset() and get_extra_context(), and without
        the cache each call repeats the same database lookup.
        """
        try:
            return self._document
        except AttributeError:
            self._document = get_object_or_404(
                Document, pk=self.kwargs['pk']
            )
            return self._document

    def get_queryset(self):
        # A document without a duplicate record yields an empty list
        # instead of an error.
        try:
            return DuplicatedDocument.objects.get(
                document=self.get_document()
            ).documents.all()
        except DuplicatedDocument.DoesNotExist:
            return Document.objects.none()

    def get_extra_context(self):
        document = self.get_document()

        return {
            'hide_links': True,
            'object': document,
            'title': _('Duplicates for document: %s') % document,
        }
class ScanDuplicatedDocuments(ConfirmView):
    """
    Confirmation view that queues a duplicate scan of all documents.
    """
    view_permission = permission_document_tools
    extra_context = {'title': _('Scan for duplicated documents?')}

    def view_action(self):
        # The scan runs as a background task; the request only queues it
        # and reports that back to the user.
        task_scan_duplicates_all.apply_async()

        success_message = _('Duplicated document scan queued successfully.')
        messages.success(self.request, success_message)