Initial set of model, form and API changes to support document versions

This commit is contained in:
Roberto Rosario
2011-12-02 02:51:59 -04:00
parent b38d84f663
commit d83e8b5428
5 changed files with 182 additions and 41 deletions

View File

@@ -35,7 +35,7 @@ def is_first_page(context):
def is_last_page(context): def is_last_page(context):
return context['page'].page_number >= context['page'].document.documentpage_set.count() return context['page'].page_number >= context['page'].document_version.pages.count()
def is_min_zoom(context): def is_min_zoom(context):

View File

@@ -2,9 +2,27 @@ from django.contrib import admin
from metadata.admin import DocumentMetadataInline from metadata.admin import DocumentMetadataInline
from documents.models import DocumentType, Document, \ from documents.models import (DocumentType, Document,
DocumentTypeFilename, DocumentPage, \ DocumentTypeFilename, DocumentPage,
DocumentPageTransformation, RecentDocument DocumentPageTransformation, RecentDocument,
DocumentVersion)
class DocumentPageInline(admin.StackedInline):
    """
    Stacked inline editor for DocumentPage rows
    """
    model = DocumentPage
    # Offer one blank extra form in addition to the existing rows
    extra = 1
    # NOTE(review): 'classes' and 'allow_add' are presumably consumed by the
    # admin skin in use -- confirm, they are not interpreted in this module.
    allow_add = True
    classes = ('collapse-open',)
class DocumentVersionInline(admin.StackedInline):
    """
    Stacked inline editor for DocumentVersion rows
    """
    model = DocumentVersion
    # Offer one blank extra form in addition to the existing rows
    extra = 1
    # NOTE(review): 'classes' and 'allow_add' are presumably consumed by the
    # admin skin in use -- confirm, they are not interpreted in this module.
    allow_add = True
    classes = ('collapse-open',)
    # NOTE(review): stock Django admin does not appear to support inlines
    # nested inside another inline; confirm this attribute has any effect.
    inlines = [DocumentPageInline]
class DocumentTypeFilenameInline(admin.StackedInline): class DocumentTypeFilenameInline(admin.StackedInline):
@@ -24,16 +42,9 @@ class DocumentPageTransformationAdmin(admin.ModelAdmin):
model = DocumentPageTransformation model = DocumentPageTransformation
class DocumentPageInline(admin.StackedInline):
model = DocumentPage
extra = 1
classes = ('collapse-open',)
allow_add = True
class DocumentAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin):
inlines = [ inlines = [
DocumentMetadataInline, DocumentPageInline DocumentMetadataInline, DocumentVersionInline
] ]
list_display = ('uuid', 'file_filename',) list_display = ('uuid', 'file_filename',)

View File

@@ -100,7 +100,7 @@ class DocumentPagesCarouselWidget(forms.widgets.Widget):
output = [] output = []
output.append(u'<div style="white-space:nowrap; overflow: auto;">') output.append(u'<div style="white-space:nowrap; overflow: auto;">')
for page in value.documentpage_set.all(): for page in value.pages.all():
output.append(u'<div style="display: inline-block; margin: 5px 10px 10px 10px;">') output.append(u'<div style="display: inline-block; margin: 5px 10px 10px 10px;">')
output.append( output.append(
document_html_widget( document_html_widget(
@@ -128,7 +128,7 @@ class DocumentPreviewForm(forms.Form):
document = kwargs.pop('document', None) document = kwargs.pop('document', None)
super(DocumentPreviewForm, self).__init__(*args, **kwargs) super(DocumentPreviewForm, self).__init__(*args, **kwargs)
self.fields['preview'].initial = document self.fields['preview'].initial = document
self.fields['preview'].label = _(u'Document pages (%s)') % document.documentpage_set.count() self.fields['preview'].label = _(u'Document pages (%s)') % document.pages.count()
preview = forms.CharField(widget=DocumentPagesCarouselWidget()) preview = forms.CharField(widget=DocumentPagesCarouselWidget())
@@ -198,7 +198,7 @@ class DocumentContentForm(forms.Form):
super(DocumentContentForm, self).__init__(*args, **kwargs) super(DocumentContentForm, self).__init__(*args, **kwargs)
content = [] content = []
self.fields['contents'].initial = u'' self.fields['contents'].initial = u''
for page in self.document.documentpage_set.all(): for page in self.document.pages.all():
if page.content: if page.content:
content.append(page.content) content.append(page.content)
content.append(u'\n\n\n - Page %s - \n\n\n' % page.page_number) content.append(u'\n\n\n - Page %s - \n\n\n' % page.page_number)

View File

@@ -72,17 +72,20 @@ class Document(models.Model):
""" """
Defines a single document with it's fields and properties Defines a single document with it's fields and properties
""" """
# Base fields
document_type = models.ForeignKey(DocumentType, verbose_name=_(u'document type'), null=True, blank=True) document_type = models.ForeignKey(DocumentType, verbose_name=_(u'document type'), null=True, blank=True)
file = models.FileField(upload_to=get_filename_from_uuid, storage=STORAGE_BACKEND(), verbose_name=_(u'file'))
uuid = models.CharField(max_length=48, default=UUID_FUNCTION(), blank=True, editable=False) uuid = models.CharField(max_length=48, default=UUID_FUNCTION(), blank=True, editable=False)
file_mimetype = models.CharField(max_length=64, default='', editable=False)
file_mime_encoding = models.CharField(max_length=64, default='', editable=False)
#FAT filename can be up to 255 using LFN
file_filename = models.CharField(max_length=255, default=u'', editable=False, db_index=True)
date_added = models.DateTimeField(verbose_name=_(u'added'), auto_now_add=True, db_index=True)
date_updated = models.DateTimeField(verbose_name=_(u'updated'), auto_now=True)
checksum = models.TextField(blank=True, null=True, verbose_name=_(u'checksum'), editable=False)
description = models.TextField(blank=True, null=True, verbose_name=_(u'description'), db_index=True) description = models.TextField(blank=True, null=True, verbose_name=_(u'description'), db_index=True)
date_added = models.DateTimeField(verbose_name=_(u'added'), auto_now_add=True, db_index=True)
## Fields to migrate
#file = models.FileField(upload_to=get_filename_from_uuid, storage=STORAGE_BACKEND(), verbose_name=_(u'file'))
#file_mimetype = models.CharField(max_length=64, default='', editable=False)
#file_mime_encoding = models.CharField(max_length=64, default='', editable=False)
##FAT filename can be up to 255 using LFN
#file_filename = models.CharField(max_length=255, default=u'', editable=False, db_index=True)
#date_updated = models.DateTimeField(verbose_name=_(u'updated'), auto_now=True)
#checksum = models.TextField(blank=True, null=True, verbose_name=_(u'checksum'), editable=False)
tags = TaggableManager() tags = TaggableManager()
@@ -155,7 +158,8 @@ class Document(models.Model):
Return a file descriptor to a document's file irrespective of Return a file descriptor to a document's file irrespective of
the storage backend the storage backend
""" """
return self.file.storage.open(self.file.path) #return self.file.storage.open(self.file.path)
return self.get_latest_version().file.storage.open(self.get_latest_version().file.path)
def update_checksum(self, save=True): def update_checksum(self, save=True):
""" """
@@ -163,11 +167,11 @@ class Document(models.Model):
user provided checksum function user provided checksum function
""" """
if self.exists(): if self.exists():
source = self.open() source = self.get_latest_version().open()
self.checksum = unicode(CHECKSUM_FUNCTION(source.read())) self.get_latest_version().checksum = unicode(CHECKSUM_FUNCTION(source.read()))
source.close() source.close()
if save: if save:
self.save() self.get_latest_version().save()
def update_page_count(self, save=True): def update_page_count(self, save=True):
handle, filepath = tempfile.mkstemp() handle, filepath = tempfile.mkstemp()
@@ -204,7 +208,8 @@ class Document(models.Model):
@property @property
def page_count(self): def page_count(self):
return self.documentpage_set.count() #return self.documentpage_set.count()
return self.get_latest_version().documentpage_set.count()
def save_to_file(self, filepath, buffer_size=1024 * 1024): def save_to_file(self, filepath, buffer_size=1024 * 1024):
""" """
@@ -229,13 +234,13 @@ class Document(models.Model):
Returns a boolean value that indicates if the document's file Returns a boolean value that indicates if the document's file
exists in storage exists in storage
""" """
return self.file.storage.exists(self.file.path) return self.get_latest_version().file.storage.exists(self.get_latest_version().file.path)
def apply_default_transformations(self, transformations): def apply_default_transformations(self, transformations):
#Only apply default transformations on new documents #Only apply default transformations on new documents
if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0: if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.pages.all()]) == 0:
for transformation in transformations: for transformation in transformations:
for document_page in self.documentpage_set.all(): for document_page in self.pages.all():
page_transformation = DocumentPageTransformation( page_transformation = DocumentPageTransformation(
document_page=document_page, document_page=document_page,
order=0, order=0,
@@ -246,7 +251,7 @@ class Document(models.Model):
page_transformation.save() page_transformation.save()
def get_cached_image_name(self, page): def get_cached_image_name(self, page):
document_page = self.documentpage_set.get(page_number=page) document_page = self.pages.get(page_number=page)
transformations, warnings = document_page.get_transformation_list() transformations, warnings = document_page.get_transformation_list()
hash_value = HASH_FUNCTION(u''.join([self.checksum, unicode(page), unicode(transformations)])) hash_value = HASH_FUNCTION(u''.join([self.checksum, unicode(page), unicode(transformations)]))
return os.path.join(CACHE_PATH, hash_value), transformations return os.path.join(CACHE_PATH, hash_value), transformations
@@ -300,15 +305,129 @@ class Document(models.Model):
def delete(self, *args, **kwargs): def delete(self, *args, **kwargs):
super(Document, self).delete(*args, **kwargs) super(Document, self).delete(*args, **kwargs)
return self.file.storage.delete(self.file.path) for version in self.documentversion_set.all():
version.file.storage.delete(version.file.path)
#return self.get_latest_version().file.storage.delete(self.get_latest_version().file.path)
@property @property
def size(self): def size(self):
if self.exists(): if self.exists():
return self.file.storage.size(self.file.path) return self.get_latest_version().file.storage.size(self.get_latest_version().file.path)
else: else:
return None return None
# Compatibility methods
@property
def file(self):
    """
    Compatibility accessor: the file of this document's latest version
    """
    latest_version = self.get_latest_version()
    return latest_version.file
@property
def file_mimetype(self):
    """
    Compatibility accessor: the mimetype of this document's latest version
    """
    latest_version = self.get_latest_version()
    return latest_version.mimetype
@property
def file_mime_encoding(self):
    """
    Compatibility accessor: the encoding of this document's latest version
    """
    latest_version = self.get_latest_version()
    return latest_version.encoding
@property
def file_filename(self):
    """
    Compatibility accessor: the filename of this document's latest version
    """
    latest_version = self.get_latest_version()
    return latest_version.filename
@property
def date_updated(self):
    """
    Compatibility accessor: the timestamp of this document's latest version
    """
    latest_version = self.get_latest_version()
    return latest_version.timestamp
#@property
#def date_added(self):
# return self.get_latest_version().timestamp
@property
def checksum(self):
    """
    Compatibility accessor: the checksum of this document's latest version

    Read-only: there is no setter, so the value must be changed on the
    version instance itself.
    """
    latest_version = self.get_latest_version()
    return latest_version.checksum
@property
def pages(self):
    """
    Compatibility accessor: the pages of this document's latest version
    """
    latest_version = self.get_latest_version()
    return latest_version.pages
#file = models.FileField(upload_to=get_filename_from_uuid, storage=STORAGE_BACKEND(), verbose_name=_(u'file'))
#file_mimetype = models.CharField(max_length=64, default='', editable=False)
#file_mime_encoding = models.CharField(max_length=64, default='', editable=False)
##FAT filename can be up to 255 using LFN
#file_filename = models.CharField(max_length=255, default=u'', editable=False, db_index=True)
#date_updated = models.DateTimeField(verbose_name=_(u'updated'), auto_now=True)
#checksum = models.TextField(blank=True, null=True, verbose_name=_(u'checksum'), editable=False)
def get_latest_version(self):
    """
    Return the version of this document that has the newest timestamp

    Raises IndexError when the document has no versions, because the
    first element of an empty queryset is requested.
    """
    newest_first = self.documentversion_set.order_by('-timestamp')
    return newest_first[0]
# Numeric identifiers for the release level of a document version; they are
# the values offered by RELEASE_LEVEL_CHOICES and the default/choices of
# DocumentVersion.release_level.
# NOTE(review): presumably persisted in the database, so existing numbers
# should not be renumbered -- confirm before changing any of them.
RELEASE_LEVEL_FINAL = 1
RELEASE_LEVEL_ALPHA = 2
RELEASE_LEVEL_BETA = 3
RELEASE_LEVEL_RC = 4
RELEASE_LEVEL_HF = 5
# (value, translatable label) pairs, one per release level defined above
RELEASE_LEVEL_CHOICES = (
(RELEASE_LEVEL_FINAL, _(u'final')),
(RELEASE_LEVEL_ALPHA, _(u'alpha')),
(RELEASE_LEVEL_BETA, _(u'beta')),
(RELEASE_LEVEL_RC, _(u'release candidate')),
(RELEASE_LEVEL_HF, _(u'hotfix')),
)
class DocumentVersion(models.Model):
    '''
    Model that describes a document version and its properties
    '''
    # Parent document this version belongs to
    document = models.ForeignKey(Document, verbose_name=_(u'document'))

    # Version identification fields.
    # NOTE(review): "mayor" looks like a misspelling of "major"; the name is
    # kept because it is a database column and part of unique_together.
    mayor = models.PositiveIntegerField(verbose_name=_(u'mayor'), default=1)
    minor = models.PositiveIntegerField(verbose_name=_(u'minor'), default=0)
    micro = models.PositiveIntegerField(verbose_name=_(u'micro'), default=0)
    release_level = models.PositiveIntegerField(choices=RELEASE_LEVEL_CHOICES, default=RELEASE_LEVEL_FINAL, verbose_name=_(u'release level'))
    serial = models.PositiveIntegerField(verbose_name=_(u'serial'), default=0)
    # No default and no auto_now: the caller must supply the value
    timestamp = models.DateTimeField(verbose_name=_(u'timestamp'))

    # File related fields
    file = models.FileField(upload_to=get_filename_from_uuid, storage=STORAGE_BACKEND(), verbose_name=_(u'file'))
    mimetype = models.CharField(max_length=64, default='', editable=False)
    encoding = models.CharField(max_length=64, default='', editable=False)
    filename = models.CharField(max_length=255, default=u'', editable=False, db_index=True)
    checksum = models.TextField(blank=True, null=True, verbose_name=_(u'checksum'), editable=False)

    class Meta:
        # A document cannot have two versions with the same version tuple
        unique_together = ('document', 'mayor', 'minor', 'micro', 'release_level', 'serial')
        verbose_name = _(u'document version')
        verbose_name_plural = _(u'document versions')

    def __unicode__(self):
        return self.get_version()

    # TODO: Update timestamp

    def get_version(self):
        '''
        Return the formatted version information

        The result is "<mayor>.<minor>", followed by ".<micro>" when micro
        is non zero, followed by the release level label and the serial
        when the release level is not final.
        '''
        vers = [u'%i.%i' % (self.mayor, self.minor)]

        if self.micro:
            vers.append(u'.%i' % self.micro)

        if self.release_level != RELEASE_LEVEL_FINAL:
            # Use the human readable label of the choice, not its number
            vers.append(u'%s%i' % (self.get_release_level_display(), self.serial))

        return u''.join(vers)

    @property
    def pages(self):
        '''
        Return the related manager of the pages that belong to this version
        '''
        return self.documentpage_set

    def open(self):
        '''
        Return a file descriptor to a document version's file irrespective of
        the storage backend
        '''
        return self.file.storage.open(self.file.path)
class DocumentTypeFilename(models.Model): class DocumentTypeFilename(models.Model):
""" """
@@ -332,7 +451,13 @@ class DocumentPage(models.Model):
""" """
Model that describes a document page including it's content Model that describes a document page including it's content
""" """
document = models.ForeignKey(Document, verbose_name=_(u'document')) ## This field is to be removed
#document = models.ForeignKey(Document, verbose_name=_(u'document'))
# New parent field
document_version = models.ForeignKey(DocumentVersion, verbose_name=_(u'document version'))#, null=True, blank=True) # TODO: Remove these after datamigration
# Unchanged fields
content = models.TextField(blank=True, null=True, verbose_name=_(u'content'), db_index=True) content = models.TextField(blank=True, null=True, verbose_name=_(u'content'), db_index=True)
page_label = models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label')) page_label = models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label'))
page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'), db_index=True) page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'), db_index=True)
@@ -341,7 +466,7 @@ class DocumentPage(models.Model):
return _(u'Page %(page_num)d out of %(total_pages)d of %(document)s') % { return _(u'Page %(page_num)d out of %(total_pages)d of %(document)s') % {
'document': unicode(self.document), 'document': unicode(self.document),
'page_num': self.page_number, 'page_num': self.page_number,
'total_pages': self.document.documentpage_set.count() 'total_pages': self.document_version.documentpage_set.count()
} }
class Meta: class Meta:
@@ -356,6 +481,11 @@ class DocumentPage(models.Model):
def get_absolute_url(self): def get_absolute_url(self):
return ('document_page_view', [self.pk]) return ('document_page_view', [self.pk])
# Compatibility methods
@property
def document(self):
    """
    Compatibility accessor: the document that owns this page's version
    """
    parent_version = self.document_version
    return parent_version.document
class ArgumentsValidator(object): class ArgumentsValidator(object):
message = _(u'Enter a valid value.') message = _(u'Enter a valid value.')
@@ -421,10 +551,10 @@ class RecentDocument(models.Model):
# Register the fields that will be searchable # Register the fields that will be searchable
register('document', Document, _(u'document'), [ register('document', Document, _(u'document'), [
{'name': u'document_type__name', 'title': _(u'Document type')}, {'name': u'document_type__name', 'title': _(u'Document type')},
{'name': u'file_mimetype', 'title': _(u'MIME type')}, {'name': u'documentversion__mimetype', 'title': _(u'MIME type')},
{'name': u'file_filename', 'title': _(u'Filename')}, {'name': u'documentversion__filename', 'title': _(u'Filename')},
{'name': u'documentmetadata__value', 'title': _(u'Metadata value')}, {'name': u'documentmetadata__value', 'title': _(u'Metadata value')},
{'name': u'documentpage__content', 'title': _(u'Content')}, {'name': u'documentversion__documentpage__content', 'title': _(u'Content')},
{'name': u'description', 'title': _(u'Description')}, {'name': u'description', 'title': _(u'Description')},
{'name': u'tags__name', 'title': _(u'Tags')}, {'name': u'tags__name', 'title': _(u'Tags')},
{'name': u'comments__comment', 'title': _(u'Comments')}, {'name': u'comments__comment', 'title': _(u'Comments')},

View File

@@ -88,7 +88,7 @@ def do_document_ocr(queue_document):
parser, if the parser fails or if there is no parser registered for parser, if the parser fails or if there is no parser registered for
the document mimetype do a visual OCR by calling tesseract the document mimetype do a visual OCR by calling tesseract
""" """
for document_page in queue_document.document.documentpage_set.all(): for document_page in queue_document.document.pages.all():
try: try:
# Try to extract text by means of a parser # Try to extract text by means of a parser
parse_document_page(document_page) parse_document_page(document_page)