Add duplicated document scan support.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2017-07-06 02:56:35 -04:00
parent 55eedc153e
commit d4e1a506ed
13 changed files with 352 additions and 16 deletions

View File

@@ -5,6 +5,7 @@ import logging
from django.apps import apps
from django.db import models
from django.db.models import F, Max
from django.utils.timezone import now
from .literals import STUB_EXPIRATION_INTERVAL
@@ -98,6 +99,45 @@ class DocumentTypeManager(models.Manager):
return self.get(label=label)
class DuplicatedDocumentManager(models.Manager):
    """
    Manager that records, per document, the other documents whose latest
    version has the same checksum.
    """

    def scan(self):
        """
        Find duplicates by iterating over all documents and matching
        their latest version checksums.
        """
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )

        for document in Document.objects.all():
            # Every document gets its own pass here, so there is no need
            # for scan_for() to also recurse into the matches it finds.
            self.scan_for(document=document, scan_children=False)

    def scan_for(self, document, scan_children=True):
        """
        Find duplicates of a single document by matching latest version
        checksums.

        document: the document to look up duplicates for.
        scan_children: when true, also run a non-recursive scan for each
        duplicate found, so each of them gets its own entry as well.
        """
        Document = apps.get_model(
            app_label='documents', model_name='Document'
        )

        # Get the documents whose latest version matches the checksum
        # of the current document and exclude the current document.
        # The annotation is applied before the filter so that
        # max_timestamp is computed over all versions of each document.
        # The queryset is evaluated once into a list: it is needed for the
        # emptiness check, for add() and for the optional recursion, and
        # each of those would otherwise re-run the aggregate query.
        duplicates = list(
            Document.objects.annotate(
                max_timestamp=Max('versions__timestamp')
            ).filter(
                versions__timestamp=F('max_timestamp'),
                versions__checksum=document.checksum
            ).exclude(pk=document.pk)
        )

        if not duplicates:
            # NOTE(review): an entry created by an earlier scan is left
            # untouched when the document no longer has duplicates --
            # confirm whether stale entries should be removed here.
            return

        instance, _created = self.get_or_create(document=document)
        instance.documents.add(*duplicates)

        if scan_children:
            # Use a distinct loop name so the `document` argument is not
            # shadowed.
            for duplicate in duplicates:
                self.scan_for(document=duplicate, scan_children=False)
class PassthroughManager(models.Manager):
    """
    Manager that defines nothing of its own on top of models.Manager.
    """