Add duplicated document scan support.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Roberto Rosario
2017-07-06 02:56:35 -04:00
parent 55eedc153e
commit d4e1a506ed
13 changed files with 352 additions and 16 deletions

View File

@@ -54,6 +54,14 @@ Other Changes
GitLab issue #373.
- Add support to search documents by their checksums.
- The document checksum field is now indexed for faster searches by checksum.
- Add support for duplicated document scanning. Every time a document is
uploaded, a scan is triggered to determine whether the new document is a
duplicate of an existing one. Duplicated documents are listed under a new
"Duplicated documents" link in the main menu. A full scan of all documents
can also be triggered with the new "Duplicated document scan" button in the
tools menu. Finally, a new "Duplicates" tab in the document view lists all
duplicates of the currently selected document.
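The same scans can also be queued from code; a minimal sketch, assuming the
app is importable as "documents" and a Celery worker is consuming the
relevant queues:

from documents.models import Document, DuplicatedDocument
from documents.tasks import task_scan_duplicates_all, task_scan_duplicates_for

# Queue a full scan, same as the "Duplicated document scan" tool button.
task_scan_duplicates_all.apply_async()

# Queue a scan for a single document, same as the post upload handler does.
document = Document.objects.first()  # hypothetical example document
task_scan_duplicates_for.apply_async(kwargs={'document_id': document.pk})

# Synchronous alternative that bypasses Celery entirely.
DuplicatedDocument.objects.scan_for(document=document)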
Removals
--------

View File

@@ -4,7 +4,7 @@ from django.contrib import admin
from .models import (
DeletedDocument, Document, DocumentPage, DocumentType,
DocumentTypeFilename, DocumentVersion, RecentDocument
DocumentTypeFilename, DocumentVersion, DuplicatedDocument, RecentDocument
)
@@ -55,6 +55,13 @@ class DocumentTypeAdmin(admin.ModelAdmin):
)
@admin.register(DuplicatedDocument)
class DuplicatedDocumentAdmin(admin.ModelAdmin):
list_display = (
'document', 'datetime_added'
)
@admin.register(RecentDocument)
class RecentDocumentAdmin(admin.ModelAdmin):
date_hierarchy = 'datetime_accessed'

View File

@@ -31,11 +31,13 @@ from rest_api.classes import APIEndPoint
from rest_api.fields import DynamicSerializerField
from statistics.classes import StatisticNamespace, CharJSLine
from .handlers import create_default_document_type
from .handlers import (
create_default_document_type, handler_scan_duplicates_for
)
from .links import (
link_clear_image_cache, link_document_clear_transformations,
link_document_clone_transformations, link_document_delete,
link_document_document_type_edit,
link_document_document_type_edit, link_document_duplicates_list,
link_document_multiple_document_type_edit, link_document_download,
link_document_edit, link_document_list, link_document_list_deleted,
link_document_list_recent, link_document_multiple_delete,
@@ -55,7 +57,8 @@ from .links import (
link_document_type_filename_list, link_document_type_list,
link_document_type_setup, link_document_update_page_count,
link_document_version_download, link_document_version_list,
link_document_version_revert, link_trash_can_empty
link_document_version_revert, link_duplicated_document_list,
link_duplicated_document_scan, link_trash_can_empty
)
from .literals import (
CHECK_DELETE_PERIOD_INTERVAL, CHECK_TRASH_PERIOD_INTERVAL,
@@ -73,6 +76,7 @@ from .permissions import (
from .queues import * # NOQA
# Just import to initialize the search models
from .search import document_search, document_page_search # NOQA
from .signals import post_version_upload
from .statistics import (
new_documents_per_month, new_document_pages_per_month,
new_document_pages_this_month, new_documents_this_month,
@@ -100,6 +104,7 @@ class DocumentsApp(MayanAppConfig):
DocumentType = self.get_model('DocumentType')
DocumentTypeFilename = self.get_model('DocumentTypeFilename')
DocumentVersion = self.get_model('DocumentVersion')
DuplicatedDocument = self.get_model('DuplicatedDocument')
DynamicSerializerField.add_serializer(
klass=Document,
@@ -271,6 +276,16 @@ class DocumentsApp(MayanAppConfig):
source=DocumentVersion, label=_('Comment'),
attribute='comment'
)
SourceColumn(
source=DuplicatedDocument, label=_('Thumbnail'),
func=lambda context: document_thumbnail_widget.render(
instance=context['object'].document
)
)
SourceColumn(
source=DuplicatedDocument, label=_('Duplicates'),
func=lambda context: context['object'].documents.count()
)
app.conf.CELERYBEAT_SCHEDULE.update(
{
@@ -328,20 +343,28 @@ class DocumentsApp(MayanAppConfig):
'documents.tasks.task_upload_new_version': {
'queue': 'uploads'
},
'documents.tasks.task_scan_duplicates_all': {
'queue': 'tools'
},
'documents.tasks.task_scan_duplicates_for': {
'queue': 'uploads'
},
}
)
menu_documents.bind_links(
links=(
link_document_list_recent, link_document_list,
link_document_list_deleted
link_document_list_deleted, link_duplicated_document_list
)
)
menu_main.bind_links(links=(menu_documents,), position=0)
menu_setup.bind_links(links=(link_document_type_setup,))
menu_tools.bind_links(links=(link_clear_image_cache,))
menu_tools.bind_links(
links=(link_clear_image_cache, link_duplicated_document_scan)
)
# Document type links
menu_object.bind_links(
@@ -384,7 +407,7 @@ class DocumentsApp(MayanAppConfig):
link_document_print, link_document_trash,
link_document_download, link_document_clear_transformations,
link_document_clone_transformations,
link_document_update_page_count
link_document_update_page_count,
), sources=(Document,)
)
menu_object.bind_links(
@@ -393,7 +416,10 @@ class DocumentsApp(MayanAppConfig):
)
# Document facet links
menu_facet.bind_links(links=(link_acl_list,), sources=(Document,))
menu_facet.bind_links(
links=(link_document_duplicates_list, link_acl_list,),
sources=(Document,)
)
menu_facet.bind_links(
links=(link_document_preview,), sources=(Document,), position=0
)
@@ -499,6 +525,10 @@ class DocumentsApp(MayanAppConfig):
create_default_document_type,
dispatch_uid='create_default_document_type'
)
post_version_upload.connect(
handler_scan_duplicates_for,
dispatch_uid='handler_scan_duplicates_for',
)
registry.register(DeletedDocument)
registry.register(Document)

View File

@@ -4,6 +4,7 @@ from django.apps import apps
from .literals import DEFAULT_DOCUMENT_TYPE_LABEL
from .signals import post_initial_document_type
from .tasks import task_scan_duplicates_for
def create_default_document_type(sender, **kwargs):
@@ -18,3 +19,9 @@ def create_default_document_type(sender, **kwargs):
post_initial_document_type.send(
sender=DocumentType, instance=document_type
)
def handler_scan_duplicates_for(sender, instance, **kwargs):
task_scan_duplicates_for.apply_async(
kwargs={'document_id': instance.document.pk}
)
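A minimal sketch of the path that reaches this handler, assuming the
post_version_upload signal is sent with the newly uploaded DocumentVersion as
instance (document_version below is a hypothetical, already saved version):

from documents.signals import post_version_upload

# Emitting the signal the way the upload code path would; the receiver
# connected in apps.py then queues task_scan_duplicates_for with the
# parent document's primary key (instance.document.pk).
post_version_upload.send(
    sender=document_version.__class__, instance=document_version
)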

View File

@@ -284,3 +284,16 @@ link_document_type_setup = Link(
icon='fa fa-file', permissions=(permission_document_type_view,),
text=_('Document types'), view='documents:document_type_list'
)
link_duplicated_document_list = Link(
icon='fa fa-clone', text=_('Duplicated documents'),
view='documents:duplicated_document_list'
)
link_document_duplicates_list = Link(
args='resolved_object.id', icon='fa fa-clone',
permissions=(permission_document_view,), text=_('Duplicates'),
view='documents:document_duplicates_list',
)
link_duplicated_document_scan = Link(
icon='fa fa-clone', text=_('Duplicated document scan'),
view='documents:duplicated_document_scan'
)

View File

@@ -5,6 +5,7 @@ import logging
from django.apps import apps
from django.db import models
from django.db.models import F, Max
from django.utils.timezone import now
from .literals import STUB_EXPIRATION_INTERVAL
@@ -98,6 +99,45 @@ class DocumentTypeManager(models.Manager):
return self.get(label=label)
class DuplicatedDocumentManager(models.Manager):
def scan(self):
"""
Find duplicates by iterating over all documents and matching
the checksums of their latest versions
"""
Document = apps.get_model(
app_label='documents', model_name='Document'
)
for document in Document.objects.all():
self.scan_for(document=document, scan_children=False)
def scan_for(self, document, scan_children=True):
"""
Find duplicates by matching latest version checksums
"""
Document = apps.get_model(
app_label='documents', model_name='Document'
)
# Get the documents whose latest version matches the checksum
# of the current document and exclude the current document
duplicates = Document.objects.annotate(
max_timestamp=Max('versions__timestamp')
).filter(
versions__timestamp=F('max_timestamp'),
versions__checksum=document.checksum
).exclude(pk=document.pk)
if duplicates.exists():
instance, created = self.get_or_create(document=document)
instance.documents.add(*duplicates)
if scan_children:
for document in duplicates:
self.scan_for(document=document, scan_children=False)
class PassthroughManager(models.Manager):
pass
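A short usage sketch of the manager, assuming a document whose latest version
checksum is shared with at least one other document (otherwise the get() below
raises DuplicatedDocument.DoesNotExist):

from documents.models import Document, DuplicatedDocument

document = Document.objects.get(pk=1)  # hypothetical primary key
DuplicatedDocument.objects.scan_for(document=document)

entry = DuplicatedDocument.objects.get(document=document)
print(entry.documents.count())      # number of duplicates found
print(list(entry.documents.all()))  # the duplicate Document instances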

View File

@@ -0,0 +1,47 @@
# -*- coding: utf-8 -*-
# Generated by Django 1.10.7 on 2017-07-06 03:30
from __future__ import unicode_literals
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
dependencies = [
('documents', '0038_auto_20170705_2008'),
]
operations = [
migrations.CreateModel(
name='DuplicatedDocument',
fields=[
(
'id', models.AutoField(
auto_created=True, primary_key=True, serialize=False,
verbose_name='ID')
),
(
'datetime_added', models.DateTimeField(
auto_now_add=True, db_index=True,
verbose_name='Added')
),
(
'document', models.ForeignKey(
on_delete=django.db.models.deletion.CASCADE,
related_name='duplicates', to='documents.Document',
verbose_name='Document')
),
(
'documents', models.ManyToManyField(
to='documents.Document',
verbose_name='Duplicated documents'
)
),
],
options={
'verbose_name': 'Duplicated document',
'verbose_name_plural': 'Duplicated documents',
},
),
]

View File

@@ -30,8 +30,8 @@ from .events import (
)
from .literals import DEFAULT_DELETE_PERIOD, DEFAULT_DELETE_TIME_UNIT
from .managers import (
DocumentManager, DocumentTypeManager, PassthroughManager,
RecentDocumentManager, TrashCanManager
DocumentManager, DocumentTypeManager, DuplicatedDocumentManager,
PassthroughManager, RecentDocumentManager, TrashCanManager
)
from .permissions import permission_document_view
from .runtime import cache_storage_backend, storage_backend
@@ -892,3 +892,25 @@ class RecentDocument(models.Model):
ordering = ('-datetime_accessed',)
verbose_name = _('Recent document')
verbose_name_plural = _('Recent documents')
@python_2_unicode_compatible
class DuplicatedDocument(models.Model):
document = models.ForeignKey(
Document, related_name='duplicates', verbose_name=_('Document')
)
documents = models.ManyToManyField(
Document, verbose_name=_('Duplicated documents')
)
datetime_added = models.DateTimeField(
auto_now_add=True, db_index=True, verbose_name=_('Added')
)
objects = DuplicatedDocumentManager()
def __str__(self):
return force_text(self.document)
class Meta:
verbose_name = _('Duplicated document')
verbose_name_plural = _('Duplicated documents')
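A sketch of how the two relations differ when traversing from a Document
instance: "duplicates" is the reverse of the ForeignKey (the scan entries
created for this document), while "documents" on each entry is the
many-to-many set of matching documents (document below is a hypothetical
existing instance):

entry = document.duplicates.first()  # DuplicatedDocument entry or None
if entry is not None:
    for duplicate in entry.documents.all():
        print(duplicate)  # uses Document's string representation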

View File

@@ -66,6 +66,29 @@ def task_generate_document_page_image(document_page_id, *args, **kwargs):
return document_page.generate_image(*args, **kwargs)
@app.task(ignore_result=True)
def task_scan_duplicates_all():
DuplicatedDocument = apps.get_model(
app_label='documents', model_name='DuplicatedDocument'
)
DuplicatedDocument.objects.scan()
@app.task(ignore_result=True)
def task_scan_duplicates_for(document_id):
Document = apps.get_model(
app_label='documents', model_name='Document'
)
DuplicatedDocument = apps.get_model(
app_label='documents', model_name='DuplicatedDocument'
)
document = Document.objects.get(pk=document_id)
DuplicatedDocument.objects.scan_for(document=document)
@app.task(bind=True, default_retry_delay=UPDATE_PAGE_COUNT_RETRY_DELAY, ignore_result=True)
def task_update_page_count(self, version_id):
DocumentVersion = apps.get_model(

View File

@@ -735,3 +735,55 @@ class DeletedDocumentTestCase(GenericDocumentViewTestCase):
response = self.get('documents:document_list_deleted')
self.assertContains(response, self.document.label, status_code=200)
class DuplicatedDocumentsViewsTestCase(GenericDocumentViewTestCase):
def setUp(self):
super(DuplicatedDocumentsViewsTestCase, self).setUp()
self.login_user()
def _upload_duplicate_document(self):
with open(TEST_SMALL_DOCUMENT_PATH) as file_object:
self.document_duplicate = self.document_type.new_document(
file_object=file_object, label=TEST_SMALL_DOCUMENT_FILENAME
)
def _request_duplicated_document_list(self):
return self.get('documents:duplicated_document_list')
def _request_document_duplicates_list(self):
return self.get(
'documents:document_duplicates_list', args=(self.document.pk,)
)
def test_duplicated_document_list_no_permissions(self):
self._upload_duplicate_document()
response = self._request_duplicated_document_list()
self.assertNotContains(
response, text=self.document.label, status_code=200
)
def test_duplicated_document_list_with_permissions(self):
self._upload_duplicate_document()
self.grant(permission=permission_document_view)
response = self._request_duplicated_document_list()
self.assertContains(
response, text=self.document.label, status_code=200
)
def test_document_duplicates_list_no_permissions(self):
self._upload_duplicate_document()
response = self._request_document_duplicates_list()
self.assertEqual(response.status_code, 403)
def test_document_duplicates_list_with_permissions(self):
self._upload_duplicate_document()
self.grant(permission=permission_document_view)
response = self._request_document_duplicates_list()
self.assertContains(
response, text=self.document.label, status_code=200
)

View File

@@ -16,8 +16,8 @@ from .views import (
ClearImageCacheView, DeletedDocumentDeleteView,
DeletedDocumentDeleteManyView, DeletedDocumentListView,
DocumentDocumentTypeEditView, DocumentDownloadFormView,
DocumentDownloadView, DocumentEditView, DocumentListView,
DocumentPageListView, DocumentPageNavigationFirst,
DocumentDownloadView, DocumentDuplicatesListView, DocumentEditView,
DocumentListView, DocumentPageListView, DocumentPageNavigationFirst,
DocumentPageNavigationLast, DocumentPageNavigationNext,
DocumentPageNavigationPrevious, DocumentPageRotateLeftView,
DocumentPageRotateRightView, DocumentPageView, DocumentPageViewResetView,
@@ -31,7 +31,8 @@ from .views import (
DocumentTypeListView, DocumentTypeEditView, DocumentUpdatePageCountView,
DocumentVersionDownloadFormView, DocumentVersionDownloadView,
DocumentVersionListView, DocumentVersionRevertView, DocumentView,
EmptyTrashCanView, RecentDocumentListView
DuplicatedDocumentListView, EmptyTrashCanView, RecentDocumentListView,
ScanDuplicatedDocuments
)
@@ -45,7 +46,11 @@ urlpatterns = [
r'^list/deleted/$', DeletedDocumentListView.as_view(),
name='document_list_deleted'
),
url(
r'^list/duplicated/$',
DuplicatedDocumentListView.as_view(),
name='duplicated_document_list'
),
url(
r'^(?P<pk>\d+)/preview/$', DocumentPreviewView.as_view(),
name='document_preview'
@@ -54,6 +59,10 @@ urlpatterns = [
r'^(?P<pk>\d+)/properties/$', DocumentView.as_view(),
name='document_properties'
),
url(
r'^(?P<pk>\d+)/duplicates/$', DocumentDuplicatesListView.as_view(),
name='document_duplicates_list'
),
url(
r'^(?P<pk>\d+)/restore/$', DocumentRestoreView.as_view(),
name='document_restore'
@@ -255,6 +264,14 @@ urlpatterns = [
DocumentTypeFilenameCreateView.as_view(),
name='document_type_filename_create'
),
# Tools
url(
r'^tools/documents/duplicated/scan/$',
ScanDuplicatedDocuments.as_view(),
name='duplicated_document_scan'
),
]
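The new routes can be resolved by name; a minimal sketch using the URL names
defined above (document below is a hypothetical Document instance, and the
"documents" namespace is assumed from the existing links):

from django.urls import reverse  # available in Django 1.10

reverse('documents:duplicated_document_list')
reverse('documents:document_duplicates_list', args=(document.pk,))
reverse('documents:duplicated_document_scan')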
api_urls = [

View File

@@ -19,6 +19,7 @@ from common.generics import (
SingleObjectDownloadView, SingleObjectEditView, SingleObjectListView
)
from common.mixins import MultipleInstanceActionMixin
from common.utils import encapsulate
from converter.models import Transformation
from converter.permissions import (
permission_transformation_delete, permission_transformation_edit
@@ -31,7 +32,9 @@ from ..forms import (
DocumentTypeSelectForm,
)
from ..literals import PAGE_RANGE_RANGE, DEFAULT_ZIP_FILENAME
from ..models import DeletedDocument, Document, RecentDocument
from ..models import (
DeletedDocument, Document, DuplicatedDocument, RecentDocument
)
from ..permissions import (
permission_document_delete, permission_document_download,
permission_document_print, permission_document_properties_edit,
@@ -167,6 +170,36 @@ class DocumentDocumentTypeEditView(MultipleObjectFormActionView):
)
class DocumentDuplicatesListView(DocumentListView):
def dispatch(self, request, *args, **kwargs):
AccessControlList.objects.check_access(
permissions=permission_document_view, user=self.request.user,
obj=self.get_document()
)
return super(
DocumentDuplicatesListView, self
).dispatch(request, *args, **kwargs)
def get_document(self):
return get_object_or_404(Document, pk=self.kwargs['pk'])
def get_queryset(self):
try:
return DuplicatedDocument.objects.get(
document=self.get_document()
).documents.all()
except DuplicatedDocument.DoesNotExist:
return Document.objects.none()
def get_extra_context(self):
return {
'hide_links': True,
'object': self.get_document(),
'title': _('Duplicates for document: %s') % self.get_document(),
}
class DocumentEditView(SingleObjectEditView):
form_class = DocumentForm
model = Document
@@ -722,3 +755,27 @@ class DocumentPrint(FormView):
return ['documents/document_print.html']
else:
return [self.template_name]
class DuplicatedDocumentListView(DocumentListView):
extra_context = {
'extra_columns': (
{
'name': _('Duplicates'),
'attribute': encapsulate(
lambda document: DuplicatedDocument.objects.get(
document=document
).documents.count()
)
},
),
'hide_links': True,
'title': _('Duplicated documents')
}
def get_document_queryset(self):
return Document.objects.filter(
pk__in=DuplicatedDocument.objects.values_list(
'document_id', flat=True
)
)

View File

@@ -8,7 +8,7 @@ from django.utils.translation import ugettext_lazy as _
from common.generics import ConfirmView
from ..permissions import permission_document_tools
from ..tasks import task_clear_image_cache
from ..tasks import task_clear_image_cache, task_scan_duplicates_all
logger = logging.getLogger(__name__)
@@ -24,3 +24,16 @@ class ClearImageCacheView(ConfirmView):
messages.success(
self.request, _('Document cache clearing queued successfully.')
)
class ScanDuplicatedDocuments(ConfirmView):
extra_context = {
'title': _('Scan for duplicated documents?')
}
view_permission = permission_document_tools
def view_action(self):
task_scan_duplicates_all.apply_async()
messages.success(
self.request, _('Duplicated document scan queued successfully.')
)
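A hedged sketch of exercising the new tool view with Django's test client,
assuming a user that holds permission_document_tools and hypothetical
credentials; posting to the confirmation view runs view_action(), which
queues the full scan:

from django.test import Client
from django.urls import reverse

client = Client()
client.login(username='admin', password='password')  # hypothetical credentials
client.post(reverse('documents:duplicated_document_scan'))
# view_action() queues task_scan_duplicates_all on the 'tools' queue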