Add duplicated document scan support.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Roberto Rosario
2017-07-06 02:56:35 -04:00
parent 55eedc153e
commit d4e1a506ed
13 changed files with 352 additions and 16 deletions

View File

@@ -54,6 +54,14 @@ Other Changes
GitLab issue #373.
- Add support to search documents by their checksums.
- The document checksum field is now indexed for faster searches by checksum.
- Add support for duplicated document scanning. Every time a document is
uploaded, a scan is triggered to determine whether the new document is a
duplicate of an existing document (see the flow sketch below). Duplicated
documents are listed under a new "Duplicated documents" link in the main
menu. A full scan of all documents can also be triggered with the new
"Duplicated document scan" button in the tools menu. Finally, a new
"Duplicates" tab in the document view lists all duplicates of the currently
selected document.
Removals
--------

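For orientation, the snippet below summarizes the end-to-end flow this commit wires up. It is only an illustrative sketch, not part of the diff; every name in it appears in the hunks that follow, and the module paths assume Mayan's top-level "documents" app.

# Illustrative sketch of the new upload-time duplicate scan flow.
from documents.handlers import handler_scan_duplicates_for
from documents.models import Document, DuplicatedDocument
from documents.signals import post_version_upload
from documents.tasks import task_scan_duplicates_for

# 1. apps.py makes this connection at startup.
post_version_upload.connect(
    handler_scan_duplicates_for, dispatch_uid='handler_scan_duplicates_for'
)

# 2. The handler queues the per-document Celery task (routed to the 'uploads'
#    queue) with the id of the document whose version was uploaded.
document = Document.objects.first()  # stand-in for the uploaded document
task_scan_duplicates_for.apply_async(kwargs={'document_id': document.pk})

# 3. The task delegates to the manager, which matches latest version checksums.
DuplicatedDocument.objects.scan_for(document=document)
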
View File

@@ -4,7 +4,7 @@ from django.contrib import admin
from .models import (
DeletedDocument, Document, DocumentPage, DocumentType,
-DocumentTypeFilename, DocumentVersion, RecentDocument
+DocumentTypeFilename, DocumentVersion, DuplicatedDocument, RecentDocument
)
@@ -55,6 +55,13 @@ class DocumentTypeAdmin(admin.ModelAdmin):
)
@admin.register(DuplicatedDocument)
class DuplicatedDocumentAdmin(admin.ModelAdmin):
list_display = (
'document', 'datetime_added'
)
@admin.register(RecentDocument)
class RecentDocumentAdmin(admin.ModelAdmin):
date_hierarchy = 'datetime_accessed'

View File

@@ -31,11 +31,13 @@ from rest_api.classes import APIEndPoint
from rest_api.fields import DynamicSerializerField
from statistics.classes import StatisticNamespace, CharJSLine
-from .handlers import create_default_document_type
+from .handlers import (
create_default_document_type, handler_scan_duplicates_for
)
from .links import (
link_clear_image_cache, link_document_clear_transformations,
link_document_clone_transformations, link_document_delete,
-link_document_document_type_edit,
+link_document_document_type_edit, link_document_duplicates_list,
link_document_multiple_document_type_edit, link_document_download,
link_document_edit, link_document_list, link_document_list_deleted,
link_document_list_recent, link_document_multiple_delete,
@@ -55,7 +57,8 @@ from .links import (
link_document_type_filename_list, link_document_type_list,
link_document_type_setup, link_document_update_page_count,
link_document_version_download, link_document_version_list,
-link_document_version_revert, link_trash_can_empty
+link_document_version_revert, link_duplicated_document_list,
link_duplicated_document_scan, link_trash_can_empty
)
from .literals import (
CHECK_DELETE_PERIOD_INTERVAL, CHECK_TRASH_PERIOD_INTERVAL,
@@ -73,6 +76,7 @@ from .permissions import (
from .queues import * # NOQA
# Just import to initialize the search models
from .search import document_search, document_page_search # NOQA
from .signals import post_version_upload
from .statistics import (
new_documents_per_month, new_document_pages_per_month,
new_document_pages_this_month, new_documents_this_month,
@@ -100,6 +104,7 @@ class DocumentsApp(MayanAppConfig):
DocumentType = self.get_model('DocumentType')
DocumentTypeFilename = self.get_model('DocumentTypeFilename')
DocumentVersion = self.get_model('DocumentVersion')
DuplicatedDocument = self.get_model('DuplicatedDocument')
DynamicSerializerField.add_serializer(
klass=Document,
@@ -271,6 +276,16 @@ class DocumentsApp(MayanAppConfig):
source=DocumentVersion, label=_('Comment'),
attribute='comment'
)
SourceColumn(
source=DuplicatedDocument, label=_('Thumbnail'),
func=lambda context: document_thumbnail_widget.render(
instance=context['object'].document
)
)
SourceColumn(
source=DuplicatedDocument, label=_('Duplicates'),
func=lambda context: context['object'].documents.count()
)
app.conf.CELERYBEAT_SCHEDULE.update(
{
@@ -328,20 +343,28 @@ class DocumentsApp(MayanAppConfig):
'documents.tasks.task_upload_new_version': {
'queue': 'uploads'
},
'documents.tasks.task_scan_duplicates_all': {
'queue': 'tools'
},
'documents.tasks.task_scan_duplicates_for': {
'queue': 'uploads'
},
}
)
menu_documents.bind_links(
links=(
link_document_list_recent, link_document_list,
-link_document_list_deleted
+link_document_list_deleted, link_duplicated_document_list
)
)
menu_main.bind_links(links=(menu_documents,), position=0)
menu_setup.bind_links(links=(link_document_type_setup,))
-menu_tools.bind_links(links=(link_clear_image_cache,))
+menu_tools.bind_links(
links=(link_clear_image_cache, link_duplicated_document_scan)
)
# Document type links
menu_object.bind_links(
@@ -384,7 +407,7 @@ class DocumentsApp(MayanAppConfig):
link_document_print, link_document_trash,
link_document_download, link_document_clear_transformations,
link_document_clone_transformations,
-link_document_update_page_count
+link_document_update_page_count,
), sources=(Document,)
)
menu_object.bind_links(
@@ -393,7 +416,10 @@ class DocumentsApp(MayanAppConfig):
)
# Document facet links
-menu_facet.bind_links(links=(link_acl_list,), sources=(Document,))
+menu_facet.bind_links(
links=(link_document_duplicates_list, link_acl_list,),
sources=(Document,)
)
menu_facet.bind_links(
links=(link_document_preview,), sources=(Document,), position=0
)
@@ -499,6 +525,10 @@ class DocumentsApp(MayanAppConfig):
create_default_document_type,
dispatch_uid='create_default_document_type'
)
post_version_upload.connect(
handler_scan_duplicates_for,
dispatch_uid='handler_scan_duplicates_for',
)
registry.register(DeletedDocument)
registry.register(Document)

View File

@@ -4,6 +4,7 @@ from django.apps import apps
from .literals import DEFAULT_DOCUMENT_TYPE_LABEL
from .signals import post_initial_document_type
from .tasks import task_scan_duplicates_for
def create_default_document_type(sender, **kwargs):
@@ -18,3 +19,9 @@ def create_default_document_type(sender, **kwargs):
post_initial_document_type.send(
sender=DocumentType, instance=document_type
)
def handler_scan_duplicates_for(sender, instance, **kwargs):
task_scan_duplicates_for.apply_async(
kwargs={'document_id': instance.document.pk}
)

View File

@@ -284,3 +284,16 @@ link_document_type_setup = Link(
icon='fa fa-file', permissions=(permission_document_type_view,),
text=_('Document types'), view='documents:document_type_list'
)
link_duplicated_document_list = Link(
icon='fa fa-clone', text=_('Duplicated documents'),
view='documents:duplicated_document_list'
)
link_document_duplicates_list = Link(
args='resolved_object.id', icon='fa fa-clone',
permissions=(permission_document_view,), text=_('Duplicates'),
view='documents:document_duplicates_list',
)
link_duplicated_document_scan = Link(
icon='fa fa-clone', text=_('Duplicated document scan'),
view='documents:duplicated_document_scan'
)

View File

@@ -5,6 +5,7 @@ import logging
from django.apps import apps
from django.db import models
from django.db.models import F, Max
from django.utils.timezone import now
from .literals import STUB_EXPIRATION_INTERVAL
@@ -98,6 +99,45 @@ class DocumentTypeManager(models.Manager):
return self.get(label=label)
class DuplicatedDocumentManager(models.Manager):
def scan(self):
"""
Find duplicates by iterating over all documents and matching their
latest version checksums
"""
Document = apps.get_model(
app_label='documents', model_name='Document'
)
for document in Document.objects.all():
self.scan_for(document=document, scan_children=False)
def scan_for(self, document, scan_children=True):
"""
Find duplicates by matching latest version checksums
"""
Document = apps.get_model(
app_label='documents', model_name='Document'
)
# Get the documents whose latest version matches the checksum
# of the current document and exclude the current document
duplicates = Document.objects.annotate(
max_timestamp=Max('versions__timestamp')
).filter(
versions__timestamp=F('max_timestamp'),
versions__checksum=document.checksum
).exclude(pk=document.pk)
if duplicates.exists():
instance, created = self.get_or_create(document=document)
instance.documents.add(*duplicates)
if scan_children:
for document in duplicates:
self.scan_for(document=document, scan_children=False)
class PassthroughManager(models.Manager):
pass

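For reference, a minimal usage sketch of the new manager API defined above, as it might be exercised from a Django shell; it uses only the methods and fields shown in this hunk and assumes the app's usual "documents" label.

# Minimal usage sketch of DuplicatedDocumentManager.
from documents.models import Document, DuplicatedDocument

# Full scan over every document; this is what the tools view's task runs.
DuplicatedDocument.objects.scan()

# Scan a single document, then inspect the duplicates recorded for it.
document = Document.objects.first()
DuplicatedDocument.objects.scan_for(document=document)
try:
    duplicates = DuplicatedDocument.objects.get(
        document=document
    ).documents.all()
except DuplicatedDocument.DoesNotExist:
    duplicates = Document.objects.none()  # no duplicates were found
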
View File

@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-07-06 03:30
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('documents', '0038_auto_20170705_2008'),
]
operations = [
migrations.CreateModel(
name='DuplicatedDocument',
fields=[
(
'id', models.AutoField(
auto_created=True, primary_key=True, serialize=False,
verbose_name='ID')
),
(
'datetime_added', models.DateTimeField(
auto_now_add=True, db_index=True,
verbose_name='Added')
),
(
'document', models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='duplicates', to='documents.Document',
verbose_name='Document')
),
(
'documents', models.ManyToManyField(
to='documents.Document',
verbose_name='Duplicated documents'
)
),
],
options={
'verbose_name': 'Duplicated document',
'verbose_name_plural': 'Duplicated documents',
},
),
]

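The new table is created by applying the documents app's migrations; a small sketch using Django's management API, equivalent to running "manage.py migrate documents" from the command line.

# Sketch: apply the documents app migrations, including the one above.
from django.core.management import call_command

call_command('migrate', 'documents')
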
View File

@@ -30,8 +30,8 @@ from .events import (
)
from .literals import DEFAULT_DELETE_PERIOD, DEFAULT_DELETE_TIME_UNIT
from .managers import (
-DocumentManager, DocumentTypeManager, PassthroughManager,
-RecentDocumentManager, TrashCanManager
+DocumentManager, DocumentTypeManager, DuplicatedDocumentManager,
+PassthroughManager, RecentDocumentManager, TrashCanManager
)
from .permissions import permission_document_view
from .runtime import cache_storage_backend, storage_backend
@@ -892,3 +892,25 @@ class RecentDocument(models.Model):
ordering = ('-datetime_accessed',)
verbose_name = _('Recent document')
verbose_name_plural = _('Recent documents')
@python_2_unicode_compatible
class DuplicatedDocument(models.Model):
document = models.ForeignKey(
Document, related_name='duplicates', verbose_name=_('Document')
)
documents = models.ManyToManyField(
Document, verbose_name=_('Duplicated documents')
)
datetime_added = models.DateTimeField(
auto_now_add=True, db_index=True, verbose_name=_('Added')
)
objects = DuplicatedDocumentManager()
def __str__(self):
return force_text(self.document)
class Meta:
verbose_name = _('Duplicated document')
verbose_name_plural = _('Duplicated documents')

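A short, hypothetical sketch of traversing the new model's relations; the field names and the related_name come from the model definition above.

# Hypothetical ORM traversal of DuplicatedDocument.
from documents.models import Document

document = Document.objects.first()
# The ForeignKey's related_name 'duplicates' exposes the DuplicatedDocument
# rows recorded for this document (the manager creates at most one per document).
entry = document.duplicates.first()
if entry is not None:
    print(entry.datetime_added)   # when the duplicate set was recorded
    print(entry.documents.all())  # documents sharing the same latest checksum
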
View File

@@ -66,6 +66,29 @@ def task_generate_document_page_image(document_page_id, *args, **kwargs):
return document_page.generate_image(*args, **kwargs)
@app.task(ignore_result=True)
def task_scan_duplicates_all():
DuplicatedDocument = apps.get_model(
app_label='documents', model_name='DuplicatedDocument'
)
DuplicatedDocument.objects.scan()
@app.task(ignore_result=True)
def task_scan_duplicates_for(document_id):
Document = apps.get_model(
app_label='documents', model_name='Document'
)
DuplicatedDocument = apps.get_model(
app_label='documents', model_name='DuplicatedDocument'
)
document = Document.objects.get(pk=document_id)
DuplicatedDocument.objects.scan_for(document=document)
@app.task(bind=True, default_retry_delay=UPDATE_PAGE_COUNT_RETRY_DELAY, ignore_result=True)
def task_update_page_count(self, version_id):
DocumentVersion = apps.get_model(

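Both tasks can also be queued directly; a short sketch follows. The queue routing comes from the CELERY_ROUTES update in apps.py above, and the document id used here is only a placeholder.

# Sketch: queue the new Celery tasks by hand.
from documents.tasks import task_scan_duplicates_all, task_scan_duplicates_for

task_scan_duplicates_all.apply_async()  # full library scan, 'tools' queue
task_scan_duplicates_for.apply_async(
    kwargs={'document_id': 1}  # placeholder id; routed to the 'uploads' queue
)
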
View File

@@ -735,3 +735,55 @@ class DeletedDocumentTestCase(GenericDocumentViewTestCase):
response = self.get('documents:document_list_deleted')
self.assertContains(response, self.document.label, status_code=200)
class DuplicatedDocumentsViewsTestCase(GenericDocumentViewTestCase):
def setUp(self):
super(DuplicatedDocumentsViewsTestCase, self).setUp()
self.login_user()
def _upload_duplicate_document(self):
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document_duplicate = self.document_type.new_document(
file_object=file_object, label=TEST_SMALL_DOCUMENT_FILENAME
)
def _request_duplicated_document_list(self):
return self.get('documents:duplicated_document_list')
def _request_document_duplicates_list(self):
return self.get(
'documents:document_duplicates_list', args=(self.document.pk,)
)
def test_duplicated_document_list_no_permissions(self):
self._upload_duplicate_document()
response = self._request_duplicated_document_list()
self.assertNotContains(
response, text=self.document.label, status_code=200
)
def test_duplicated_document_list_with_permissions(self):
self._upload_duplicate_document()
self.grant(permission=permission_document_view)
response = self._request_duplicated_document_list()
self.assertContains(
response, text=self.document.label, status_code=200
)
def test_document_duplicates_list_no_permissions(self):
self._upload_duplicate_document()
response = self._request_document_duplicates_list()
self.assertEqual(response.status_code, 403)
def test_document_duplicates_list_with_permissions(self):
self._upload_duplicate_document()
self.grant(permission=permission_document_view)
response = self._request_document_duplicates_list()
self.assertContains(
response, text=self.document.label, status_code=200
)

View File

@@ -16,8 +16,8 @@ from .views import (
ClearImageCacheView, DeletedDocumentDeleteView,
DeletedDocumentDeleteManyView, DeletedDocumentListView,
DocumentDocumentTypeEditView, DocumentDownloadFormView,
-DocumentDownloadView, DocumentEditView, DocumentListView,
-DocumentPageListView, DocumentPageNavigationFirst,
+DocumentDownloadView, DocumentDuplicatesListView, DocumentEditView,
+DocumentListView, DocumentPageListView, DocumentPageNavigationFirst,
DocumentPageNavigationLast, DocumentPageNavigationNext,
DocumentPageNavigationPrevious, DocumentPageRotateLeftView,
DocumentPageRotateRightView, DocumentPageView, DocumentPageViewResetView,
@@ -31,7 +31,8 @@ from .views import (
DocumentTypeListView, DocumentTypeEditView, DocumentUpdatePageCountView,
DocumentVersionDownloadFormView, DocumentVersionDownloadView,
DocumentVersionListView, DocumentVersionRevertView, DocumentView,
-EmptyTrashCanView, RecentDocumentListView
+DuplicatedDocumentListView, EmptyTrashCanView, RecentDocumentListView,
ScanDuplicatedDocuments
)
@@ -45,7 +46,11 @@ urlpatterns = [
r'^list/deleted/$', DeletedDocumentListView.as_view(),
name='document_list_deleted'
),
url(
r'^list/duplicated/$',
DuplicatedDocumentListView.as_view(),
name='duplicated_document_list'
),
url(
r'^(?P<pk>\d+)/preview/$', DocumentPreviewView.as_view(),
name='document_preview'
@@ -54,6 +59,10 @@ urlpatterns = [
r'^(?P<pk>\d+)/properties/$', DocumentView.as_view(),
name='document_properties'
),
url(
r'^(?P<pk>\d+)/duplicates/$', DocumentDuplicatesListView.as_view(),
name='document_duplicates_list'
),
url(
r'^(?P<pk>\d+)/restore/$', DocumentRestoreView.as_view(),
name='document_restore'
@@ -255,6 +264,14 @@ urlpatterns = [
DocumentTypeFilenameCreateView.as_view(),
name='document_type_filename_create'
),
# Tools
url(
r'^tools/documents/duplicated/scan/$',
ScanDuplicatedDocuments.as_view(),
name='duplicated_document_scan'
),
]
api_urls = [

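The three new routes resolve by name under the existing 'documents' namespace; a small sketch follows (Django 1.10+; the resulting paths depend on where these urlpatterns are mounted, and the pk is a placeholder).

# Sketch: reverse the new URL names added above.
from django.urls import reverse

reverse('documents:duplicated_document_list')
reverse('documents:document_duplicates_list', args=(1,))  # placeholder pk
reverse('documents:duplicated_document_scan')
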
View File

@@ -19,6 +19,7 @@ from common.generics import (
SingleObjectDownloadView, SingleObjectEditView, SingleObjectListView
)
from common.mixins import MultipleInstanceActionMixin
from common.utils import encapsulate
from converter.models import Transformation
from converter.permissions import (
permission_transformation_delete, permission_transformation_edit
@@ -31,7 +32,9 @@ from ..forms import (
DocumentTypeSelectForm,
)
from ..literals import PAGE_RANGE_RANGE, DEFAULT_ZIP_FILENAME
-from ..models import DeletedDocument, Document, RecentDocument
+from ..models import (
DeletedDocument, Document, DuplicatedDocument, RecentDocument
)
from ..permissions import (
permission_document_delete, permission_document_download,
permission_document_print, permission_document_properties_edit,
@@ -167,6 +170,36 @@ class DocumentDocumentTypeEditView(MultipleObjectFormActionView):
)
class DocumentDuplicatesListView(DocumentListView):
def dispatch(self, request, *args, **kwargs):
AccessControlList.objects.check_access(
permissions=permission_document_view, user=self.request.user,
obj=self.get_document()
)
return super(
DocumentDuplicatesListView, self
).dispatch(request, *args, **kwargs)
def get_document(self):
return get_object_or_404(Document, pk=self.kwargs['pk'])
def get_queryset(self):
try:
return DuplicatedDocument.objects.get(
document=self.get_document()
).documents.all()
except DuplicatedDocument.DoesNotExist:
return Document.objects.none()
def get_extra_context(self):
return {
'hide_links': True,
'object': self.get_document(),
'title': _('Duplicates for document: %s') % self.get_document(),
}
class DocumentEditView(SingleObjectEditView):
form_class = DocumentForm
model = Document
@@ -722,3 +755,27 @@ class DocumentPrint(FormView):
return ['documents/document_print.html']
else:
return [self.template_name]
class DuplicatedDocumentListView(DocumentListView):
extra_context = {
'extra_columns': (
{
'name': _('Duplicates'),
'attribute': encapsulate(
lambda document: DuplicatedDocument.objects.get(
document=document
).documents.count()
)
},
),
'hide_links': True,
'title': _('Duplicated documents')
}
def get_document_queryset(self):
return Document.objects.filter(
pk__in=DuplicatedDocument.objects.values_list(
'document_id', flat=True
)
)

View File

@@ -8,7 +8,7 @@ from django.utils.translation import ugettext_lazy as _
from common.generics import ConfirmView
from ..permissions import permission_document_tools
-from ..tasks import task_clear_image_cache
+from ..tasks import task_clear_image_cache, task_scan_duplicates_all
logger = logging.getLogger(__name__)
@@ -24,3 +24,16 @@ class ClearImageCacheView(ConfirmView):
messages.success(
self.request, _('Document cache clearing queued successfully.')
)
class ScanDuplicatedDocuments(ConfirmView):
extra_context = {
'title': _('Scan for duplicated documents?')
}
view_permission = permission_document_tools
def view_action(self):
task_scan_duplicates_all.apply_async()
messages.success(
self.request, _('Duplicated document scan queued successfully.')
)