Add duplicated document scan support.
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -5,6 +5,7 @@ import logging
|
||||
|
||||
from django.apps import apps
|
||||
from django.db import models
|
||||
from django.db.models import F, Max
|
||||
from django.utils.timezone import now
|
||||
|
||||
from .literals import STUB_EXPIRATION_INTERVAL
|
||||
@@ -98,6 +99,45 @@ class DocumentTypeManager(models.Manager):
|
||||
return self.get(label=label)
|
||||
|
||||
|
||||
class DuplicatedDocumentManager(models.Manager):
|
||||
def scan(self):
|
||||
"""
|
||||
Find duplicates by iterating over all documents and then
|
||||
find matching latest version checksums
|
||||
"""
|
||||
Document = apps.get_model(
|
||||
app_label='documents', model_name='Document'
|
||||
)
|
||||
|
||||
for document in Document.objects.all():
|
||||
self.scan_for(document=document, scan_children=False)
|
||||
|
||||
def scan_for(self, document, scan_children=True):
|
||||
"""
|
||||
Find duplicates by matching latest version checksums
|
||||
"""
|
||||
Document = apps.get_model(
|
||||
app_label='documents', model_name='Document'
|
||||
)
|
||||
|
||||
# Get the documents whose latest version matches the checksum
|
||||
# of the current document and exclude the current document
|
||||
duplicates = Document.objects.annotate(
|
||||
max_timestamp=Max('versions__timestamp')
|
||||
).filter(
|
||||
versions__timestamp=F('max_timestamp'),
|
||||
versions__checksum=document.checksum
|
||||
).exclude(pk=document.pk)
|
||||
|
||||
if duplicates.exists():
|
||||
instance, created = self.get_or_create(document=document)
|
||||
instance.documents.add(*duplicates)
|
||||
|
||||
if scan_children:
|
||||
for document in duplicates:
|
||||
self.scan_for(document=document, scan_children=False)
|
||||
|
||||
|
||||
class PassthroughManager(models.Manager):
|
||||
pass
|
||||
|
||||
|
||||
Reference in New Issue
Block a user