Added multipage document support and document page transformation

This commit is contained in:
Roberto Rosario
2011-02-14 00:18:16 -04:00
parent 65d1e5b176
commit 06d7e5a46a
21 changed files with 219 additions and 73 deletions

View File

View File

@@ -0,0 +1,3 @@
from django.conf import settings
TEMPORARY_DIRECTORY = getattr(settings, 'COMMON_TEMPORARY_DIRECTORY', u'/tmp')

View File

@@ -1,5 +1,5 @@
import tempfile import tempfile
from documents.conf import settings as documents_settings from common.conf import settings as common_settings
TEMPORARY_DIRECTORY = documents_settings.TEMPORARY_DIRECTORY if documents_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp() TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp()

View File

@@ -6,10 +6,8 @@ import shutil
from django.template.defaultfilters import slugify from django.template.defaultfilters import slugify
from documents.utils import from_descriptor_to_tempfile
from converter.conf.settings import CONVERT_PATH from converter.conf.settings import CONVERT_PATH
from converter.conf.settings import IDENTIFY_PATH
from converter.conf.settings import OCR_OPTIONS from converter.conf.settings import OCR_OPTIONS
from converter.conf.settings import DEFAULT_OPTIONS from converter.conf.settings import DEFAULT_OPTIONS
from converter.conf.settings import LOW_QUALITY_OPTIONS from converter.conf.settings import LOW_QUALITY_OPTIONS
@@ -18,6 +16,7 @@ from converter.conf.settings import HIGH_QUALITY_OPTIONS
#from converter.conf.settings import UNOCONV_PATH #from converter.conf.settings import UNOCONV_PATH
from converter import TEMPORARY_DIRECTORY from converter import TEMPORARY_DIRECTORY
from utils import from_descriptor_to_tempfile
QUALITY_DEFAULT = 'quality_default' QUALITY_DEFAULT = 'quality_default'
@@ -73,6 +72,16 @@ def execute_unoconv(input_filepath, output_filepath, arguments=''):
return (proc.wait(), proc.stderr.read()) return (proc.wait(), proc.stderr.read())
def execute_identify(input_filepath, arguments):
    """
    Run the ImageMagick ``identify`` binary (IDENTIFY_PATH) on a file.

    ``arguments`` is a command line fragment that is split with
    shlex and placed before ``input_filepath`` on the command line.

    Returns a ``(exit_status, stderr_output, stdout_output)`` tuple.
    """
    command = [IDENTIFY_PATH]
    command.extend(shlex.split(str(arguments)))
    command.append(input_filepath)

    proc = subprocess.Popen(command, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    # communicate() drains both pipes while waiting for the child; calling
    # wait() first can deadlock once the child fills a pipe buffer.
    stdout_data, stderr_data = proc.communicate()
    return (proc.returncode, stderr_data, stdout_data)
def cache_cleanup(input_filepath, size, page=0, format='jpg'): def cache_cleanup(input_filepath, size, page=0, format='jpg'):
filepath = create_image_cache_filename(input_filepath, size, page, format) filepath = create_image_cache_filename(input_filepath, size, page, format)
try: try:
@@ -126,7 +135,6 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
try: try:
input_arg = '%s[%s]' % (input_filepath, page) input_arg = '%s[%s]' % (input_filepath, page)
extra_options += ' -resize %s' % size extra_options += ' -resize %s' % size
print 'extra_options', extra_options
status, error_string = execute_convert(input_arg, extra_options, '%s:%s' % (format, output_filepath), quality=quality) status, error_string = execute_convert(input_arg, extra_options, '%s:%s' % (format, output_filepath), quality=quality)
if status: if status:
errors = get_errors(error_string) errors = get_errors(error_string)
@@ -138,6 +146,15 @@ def convert(input_filepath, size, quality=QUALITY_DEFAULT, cache=True, page=0, f
return output_filepath return output_filepath
def get_page_count(input_filepath):
    """
    Return the number of pages of the file at ``input_filepath`` as
    reported by ``identify -format %n``.

    Raises ConvertError (with the exit status and the errors extracted
    by get_errors) when identify exits with a non zero status.
    """
    status, error_string, output = execute_identify(input_filepath, '-format %n')
    if status:
        # The previous version returned from a ``finally`` clause, which
        # silently discarded this exception.
        raise ConvertError(status, get_errors(error_string))

    # NOTE(review): identify may emit the %n value once per page for
    # multi page files (e.g. '333' for 3 pages) - confirm against the
    # installed ImageMagick version before trusting counts > 1.
    return int(output)
#TODO: slugify OCR_OPTIONS and add to file name to cache #TODO: slugify OCR_OPTIONS and add to file name to cache
def convert_document_for_ocr(document, page=0, format='tif'): def convert_document_for_ocr(document, page=0, format='tif'):
#Extract document file #Extract document file

View File

@@ -5,6 +5,7 @@ ugettext = lambda s: s
CONVERT_PATH = getattr(settings, 'CONVERTER_CONVERT_PATH', u'/usr/bin/convert') CONVERT_PATH = getattr(settings, 'CONVERTER_CONVERT_PATH', u'/usr/bin/convert')
IDENTIFY_PATH = getattr(settings, 'CONVERTER_IDENTIFY_PATH', u'/usr/bin/identify')
OCR_OPTIONS = getattr(settings, 'CONVERTER_OCR_OPTIONS', u'-colorspace Gray -depth 8 -resample 200x200') OCR_OPTIONS = getattr(settings, 'CONVERTER_OCR_OPTIONS', u'-colorspace Gray -depth 8 -resample 200x200')
DEFAULT_OPTIONS = getattr(settings, 'CONVERTER_DEFAULT_OPTIONS', u'') DEFAULT_OPTIONS = getattr(settings, 'CONVERTER_DEFAULT_OPTIONS', u'')
LOW_QUALITY_OPTIONS = getattr(settings, 'CONVERTER_LOW_QUALITY_OPTIONS', u'') LOW_QUALITY_OPTIONS = getattr(settings, 'CONVERTER_LOW_QUALITY_OPTIONS', u'')

59
apps/converter/utils.py Normal file
View File

@@ -0,0 +1,59 @@
import os
import tempfile
from converter import TEMPORARY_DIRECTORY
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
def copyfile(source, dest, buffer_size=1024*1024):
    """
    Copy a file from source to dest.  source and dest
    can either be strings (paths) or any object with a read or
    write method.

    Data is transferred in chunks of at most ``buffer_size``.
    Both ends are closed when the copy finishes, including
    objects supplied by the caller, and also when the copy fails
    part way through.
    """
    if not hasattr(source, 'read'):
        source = open(source, 'rb')

    try:
        if not hasattr(dest, 'write'):
            dest = open(dest, 'wb')

        try:
            while True:
                copy_buffer = source.read(buffer_size)
                if not copy_buffer:
                    break
                dest.write(copy_buffer)
        finally:
            # Release the destination even if a read or write raised
            dest.close()
    finally:
        source.close()
def from_descriptor_to_tempfile(input_descriptor, filename, buffer_size=1024*1024):
    """
    Copy the content of ``input_descriptor`` into a file called
    ``filename`` inside TEMPORARY_DIRECTORY and return the resulting path.

    The content is read in chunks of at most ``buffer_size``.  Both the
    input descriptor and the output file are closed before returning,
    even when the copy fails.
    """
    # NOTE(review): filename is joined as is; callers should make sure it
    # can not point outside of TEMPORARY_DIRECTORY.
    path = os.path.join(TEMPORARY_DIRECTORY, filename)

    try:
        output_descriptor = open(path, 'wb')
        try:
            while True:
                copy_buffer = input_descriptor.read(buffer_size)
                if not copy_buffer:
                    break
                output_descriptor.write(copy_buffer)
        finally:
            output_descriptor.close()
    finally:
        input_descriptor.close()

    return path
def from_descriptor_to_new_tempfile(input_descriptor, buffer_size=1024*1024):
    """
    Copy the content of ``input_descriptor`` into a newly created
    temporary file (tempfile.mkstemp) and return that file's path.

    The content is read in chunks of at most ``buffer_size``.  The input
    descriptor and the temporary file are closed before returning, even
    when the copy fails.  The caller is responsible for deleting the
    returned file.
    """
    raw_descriptor, tmp_filename = tempfile.mkstemp()
    # Wrap the OS level descriptor in a file object: its write() always
    # writes the whole chunk, while a bare os.write() may write less
    # than requested.
    output_file = os.fdopen(raw_descriptor, 'wb')

    try:
        try:
            while True:
                copy_buffer = input_descriptor.read(buffer_size)
                if not copy_buffer:
                    break
                output_file.write(copy_buffer)
        finally:
            output_file.close()
    finally:
        input_descriptor.close()

    return tmp_filename

View File

@@ -9,10 +9,10 @@ from common.utils import pretty_size
from permissions.api import register_permissions from permissions.api import register_permissions
from models import Document, DocumentTransformation from models import Document, DocumentPage, DocumentPageTransformation
from staging import StagingFile from staging import StagingFile
from documents.conf import settings as documents_settings from common.conf import settings as common_settings
PERMISSION_DOCUMENT_CREATE = 'document_create' PERMISSION_DOCUMENT_CREATE = 'document_create'
PERMISSION_DOCUMENT_PROPERTIES_EDIT = 'document_properties_edit' PERMISSION_DOCUMENT_PROPERTIES_EDIT = 'document_properties_edit'
@@ -43,18 +43,18 @@ document_edit_metadata = {'text':_('edit metadata'), 'view':'document_edit_metad
document_preview = {'text':_('preview'), 'class':'fancybox', 'view':'document_preview', 'args':'object.id', 'famfam':'magnifier', 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_VIEW]}} document_preview = {'text':_('preview'), 'class':'fancybox', 'view':'document_preview', 'args':'object.id', 'famfam':'magnifier', 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_VIEW]}}
document_download = {'text':_('download'), 'view':'document_download', 'args':'object.id', 'famfam':'page_save', 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_DOWNLOAD]}} document_download = {'text':_('download'), 'view':'document_download', 'args':'object.id', 'famfam':'page_save', 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_DOWNLOAD]}}
document_transformation_list = {'text':_(u'transformations'), 'view':'document_transformation_list', 'args':'object.id', 'famfam':'page_paintbrush', 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_TRANSFORM]}} #document_transformation_list = {'text':_(u'transformations'), 'view':'document_transformation_list', 'args':'object.id', 'famfam':'page_paintbrush', 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_TRANSFORM]}}
document_transformation_delete = {'text':_('delete'), 'view':'document_transformation_delete', 'args':'object.id', 'famfam':'delete'}#, 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_TRANSFORM]}} #document_transformation_delete = {'text':_('delete'), 'view':'document_transformation_delete', 'args':'object.id', 'famfam':'delete'}#, 'permissions':{'namespace':'documents', 'permissions':[PERMISSION_DOCUMENT_TRANSFORM]}}
staging_file_preview = {'text':_('preview'), 'class':'fancybox', 'view':'staging_file_preview', 'args':'object.id', 'famfam':'drive_magnify'} staging_file_preview = {'text':_('preview'), 'class':'fancybox', 'view':'staging_file_preview', 'args':'object.id', 'famfam':'drive_magnify'}
staging_file_delete = {'text':_('delete'), 'view':'staging_file_delete', 'args':'object.id', 'famfam':'drive_delete'} staging_file_delete = {'text':_('delete'), 'view':'staging_file_delete', 'args':'object.id', 'famfam':'drive_delete'}
register_links(Document, [document_view, document_edit, document_edit_metadata, document_delete, document_download, document_transformation_list], menu_name='sidebar') register_links(Document, [document_view, document_edit, document_edit_metadata, document_delete, document_download], menu_name='sidebar')
register_links(Document, [document_list, document_create, document_create_multiple, document_create_sibling], menu_name='sidebar') register_links(Document, [document_list, document_create, document_create_multiple, document_create_sibling], menu_name='sidebar')
register_links(['document_list', 'document_create', 'document_create_multiple', 'upload_document_with_type', 'upload_multiple_documents_with_type'], [document_list, document_create, document_create_multiple], menu_name='sidebar') register_links(['document_list', 'document_create', 'document_create_multiple', 'upload_document_with_type', 'upload_multiple_documents_with_type'], [document_list, document_create, document_create_multiple], menu_name='sidebar')
register_links(DocumentTransformation, [document_transformation_delete]) #register_links(DocumentTransformation, [document_transformation_delete])
@@ -76,4 +76,4 @@ register_menu([
document_list document_list
],'famfam':'page','position':4}]) ],'famfam':'page','position':4}])
TEMPORARY_DIRECTORY = documents_settings.TEMPORARY_DIRECTORY if documents_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp() TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp()

View File

@@ -3,7 +3,7 @@ from django.contrib import admin
from models import MetadataType, DocumentType, Document, \ from models import MetadataType, DocumentType, Document, \
DocumentTypeMetadataType, DocumentMetadata, DocumentTypeFilename, \ DocumentTypeMetadataType, DocumentMetadata, DocumentTypeFilename, \
MetadataIndex, DocumentMetadataIndex, DocumentPage, MetadataGroup, \ MetadataIndex, DocumentMetadataIndex, DocumentPage, MetadataGroup, \
MetadataGroupItem, DocumentTransformation MetadataGroupItem, DocumentPageTransformation
class MetadataTypeAdmin(admin.ModelAdmin): class MetadataTypeAdmin(admin.ModelAdmin):
@@ -48,7 +48,11 @@ class DocumentMetadataIndexInline(admin.StackedInline):
extra = 1 extra = 1
classes = ('collapse-open',) classes = ('collapse-open',)
allow_add = True allow_add = True
readonly_fields = ('metadata_index', 'filename') readonly_fields = ('suffix', 'metadata_index', 'filename')
class DocumentPageTransformationAdmin(admin.ModelAdmin):
model = DocumentPageTransformation
class DocumentPageInline(admin.StackedInline): class DocumentPageInline(admin.StackedInline):
@@ -58,16 +62,9 @@ class DocumentPageInline(admin.StackedInline):
allow_add = True allow_add = True
class DocumentTransformationline(admin.StackedInline):
model = DocumentTransformation
extra = 1
classes = ('collapse-open',)
allow_add = True
class DocumentAdmin(admin.ModelAdmin): class DocumentAdmin(admin.ModelAdmin):
inlines = [DocumentMetadataInline, DocumentMetadataIndexInline, inlines = [DocumentMetadataInline, DocumentMetadataIndexInline,
DocumentTransformationline, DocumentPageInline] DocumentPageInline]
list_display = ('uuid', 'file_filename', 'file_extension') list_display = ('uuid', 'file_filename', 'file_extension')
@@ -87,4 +84,5 @@ admin.site.register(MetadataType, MetadataTypeAdmin)
admin.site.register(DocumentType, DocumentTypeAdmin) admin.site.register(DocumentType, DocumentTypeAdmin)
admin.site.register(Document, DocumentAdmin) admin.site.register(Document, DocumentAdmin)
admin.site.register(MetadataGroup, MetadataGroupAdmin) admin.site.register(MetadataGroup, MetadataGroupAdmin)
admin.site.register(DocumentPageTransformation, DocumentPageTransformationAdmin)

View File

@@ -1,10 +1,13 @@
import datetime import datetime
import hashlib import hashlib
import uuid import uuid
import tempfile
from django.conf import settings from django.conf import settings
from django.contrib.auth.models import User from django.contrib.auth.models import User
from converter.api import get_page_count
from documents.storage import DocumentStorage from documents.storage import DocumentStorage
default_available_functions = { default_available_functions = {
@@ -29,6 +32,7 @@ DELETE_LOCAL_ORIGINAL = getattr(settings, 'DOCUMENTS_DELETE_LOCAL_ORIGINAL', Fal
# Saving # Saving
CHECKSUM_FUNCTION = getattr(settings, 'DOCUMENTS_CHECKSUM_FUNCTION', lambda x: hashlib.sha256(x).hexdigest()) CHECKSUM_FUNCTION = getattr(settings, 'DOCUMENTS_CHECKSUM_FUNCTION', lambda x: hashlib.sha256(x).hexdigest())
UUID_FUNCTION = getattr(settings, 'DOCUMENTS_UUID_FUNCTION', lambda:unicode(uuid.uuid4())) UUID_FUNCTION = getattr(settings, 'DOCUMENTS_UUID_FUNCTION', lambda:unicode(uuid.uuid4()))
def _default_page_count(document):
    """
    Default DOCUMENTS_PAGE_COUNT_FUNCTION: dump the document to a
    temporary file and return the page count reported by get_page_count.
    """
    import os

    descriptor, filepath = tempfile.mkstemp()
    # mkstemp returns an open descriptor; close it so it is not leaked,
    # save_to_file reopens the file by path.
    os.close(descriptor)
    try:
        return get_page_count(document.save_to_file(filepath))
    finally:
        # Do not leave one temporary copy behind per counted document
        try:
            os.remove(filepath)
        except OSError:
            pass


PAGE_COUNT_FUNCTION = getattr(settings, 'DOCUMENTS_PAGE_COUNT_FUNCTION', _default_page_count)
# Storage # Storage
STORAGE_BACKEND = getattr(settings, 'DOCUMENTS_STORAGE_BACKEND', DocumentStorage) STORAGE_BACKEND = getattr(settings, 'DOCUMENTS_STORAGE_BACKEND', DocumentStorage)
@@ -36,6 +40,7 @@ STORAGE_DIRECTORY_NAME = getattr(settings, 'DOCUMENTS_STORAGE_DIRECTORY_NAME', '
# Usage # Usage
PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_PREVIEW_SIZE', '640x480') PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_PREVIEW_SIZE', '640x480')
MULTIPAGE_PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_MULTIPAGE_PREVIEW_SIZE', '160x120')
THUMBNAIL_SIZE = getattr(settings, 'DOCUMENTS_THUMBNAIL_SIZE', '50x50') THUMBNAIL_SIZE = getattr(settings, 'DOCUMENTS_THUMBNAIL_SIZE', '50x50')
DISPLAY_SIZE = getattr(settings, 'DOCUMENTS_DISPLAY_SIZE', '1200') DISPLAY_SIZE = getattr(settings, 'DOCUMENTS_DISPLAY_SIZE', '1200')
@@ -48,8 +53,3 @@ FILESYSTEM_FILESERVING_ENABLE = getattr(settings, 'DOCUMENTS_FILESYSTEM_FILESERV
FILESYSTEM_FILESERVING_PATH = getattr(settings, 'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', u'/tmp/mayan/documents') FILESYSTEM_FILESERVING_PATH = getattr(settings, 'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', u'/tmp/mayan/documents')
FILESYSTEM_SLUGIFY_PATHS = getattr(settings, 'DOCUMENTS_SLUGIFY_PATHS', False) FILESYSTEM_SLUGIFY_PATHS = getattr(settings, 'DOCUMENTS_SLUGIFY_PATHS', False)
FILESYSTEM_MAX_RENAME_COUNT = getattr(settings, 'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 200) FILESYSTEM_MAX_RENAME_COUNT = getattr(settings, 'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 200)
#misc
TEMPORARY_DIRECTORY = getattr(settings, 'DOCUMENTS_TEMPORARY_DIRECTORY', u'/tmp')

View File

@@ -24,8 +24,22 @@ from documents.conf.settings import AVAILABLE_MODELS
class ImageWidget(forms.widgets.Widget): class ImageWidget(forms.widgets.Widget):
def render(self, name, value, attrs=None): def render(self, name, value, attrs=None):
output = [] output = []
output.append('<a class="fancybox-noscaling" href="%s"><img width="300" src="%s" /></a>' % (reverse('document_display', args=[value.id]),
reverse('document_preview', args=[value.id]))) page_count = value.documentpage_set.count()
if page_count > 1:
output.append('<br /><span class="famfam active famfam-page_white_copy"></span>%s<br />' % ugettext(u'Pages'))
for page_index in range(value.documentpage_set.count()):
output.append('<span>%(page)s)<a rel="gallery_1" class="fancybox-noscaling" href="%(url)s?page=%(page)s"><img src="%(img)s?page=%(page)s" /></a></span>' % {
'url':reverse('document_display', args=[value.id]),
'img':reverse('document_preview_multipage', args=[value.id]),
'page':page_index+1,
})
else:
output.append('<a class="fancybox-noscaling" href="%(url)s"><img width="300" src="%(img)s" /></a>' % {
'url':reverse('document_display', args=[value.id]),
'img':reverse('document_preview', args=[value.id]),
})
output.append('<br /><span class="famfam active famfam-magnifier"></span>%s' % ugettext(u'Click on the image for full size view')) output.append('<br /><span class="famfam active famfam-magnifier"></span>%s' % ugettext(u'Click on the image for full size view'))
#output.append(super(ImageWidget, self).render(name, value, attrs)) #output.append(super(ImageWidget, self).render(name, value, attrs))
return mark_safe(u''.join(output)) return mark_safe(u''.join(output))
@@ -58,7 +72,7 @@ class DocumentPreviewForm(forms.Form):
super(DocumentPreviewForm, self).__init__(*args, **kwargs) super(DocumentPreviewForm, self).__init__(*args, **kwargs)
self.fields['preview'].initial = self.document self.fields['preview'].initial = self.document
preview = forms.CharField(widget=ImageWidget) preview = forms.CharField(widget=ImageWidget())
class DocumentForm_view(DetailForm): class DocumentForm_view(DetailForm):

View File

@@ -20,6 +20,7 @@ from documents.conf.settings import AVAILABLE_FUNCTIONS
from documents.conf.settings import AVAILABLE_MODELS from documents.conf.settings import AVAILABLE_MODELS
from documents.conf.settings import CHECKSUM_FUNCTION from documents.conf.settings import CHECKSUM_FUNCTION
from documents.conf.settings import UUID_FUNCTION from documents.conf.settings import UUID_FUNCTION
from documents.conf.settings import PAGE_COUNT_FUNCTION
from documents.conf.settings import STORAGE_BACKEND from documents.conf.settings import STORAGE_BACKEND
from documents.conf.settings import STORAGE_DIRECTORY_NAME from documents.conf.settings import STORAGE_DIRECTORY_NAME
from documents.conf.settings import FILESYSTEM_FILESERVING_ENABLE from documents.conf.settings import FILESYSTEM_FILESERVING_ENABLE
@@ -74,12 +75,15 @@ class Document(models.Model):
verbose_name_plural = _(u'documents') verbose_name_plural = _(u'documents')
ordering = ['-date_added'] ordering = ['-date_added']
def __unicode__(self): def __unicode__(self):
return '%s.%s' % (self.file_filename, self.file_extension) return '%s.%s' % (self.file_filename, self.file_extension)
def get_fullname(self): def get_fullname(self):
return os.extsep.join([self.file_filename, self.file_extension]) return os.extsep.join([self.file_filename, self.file_extension])
def update_mimetype(self): def update_mimetype(self):
try: try:
mime = magic.Magic(mime=True) mime = magic.Magic(mime=True)
@@ -95,25 +99,52 @@ class Document(models.Model):
def read(self, count=1024): def read(self, count=1024):
return self.file.storage.open(self.file.url).read(count) return self.file.storage.open(self.file.url).read(count)
@models.permalink @models.permalink
def get_absolute_url(self): def get_absolute_url(self):
return ('document_view', [self.id]) return ('document_view', [self.id])
def update_checksum(self, save=True): def update_checksum(self, save=True):
if self.exists(): if self.exists():
self.checksum = unicode(CHECKSUM_FUNCTION(self.file.read())) self.checksum = unicode(CHECKSUM_FUNCTION(self.file.read()))
if save: if save:
self.save() self.save()
def update_page_count(self):
    """
    Ask PAGE_COUNT_FUNCTION how many pages this document has and make
    sure a DocumentPage exists for every page number from 1 up to that
    total.  Pages that already exist are left as they are.
    """
    last_page = PAGE_COUNT_FUNCTION(self)
    # Page numbers are stored 1-based
    for number in range(1, last_page + 1):
        DocumentPage.objects.get_or_create(document=self, page_number=number)
def save_to_file(self, filepath, buffer_size=1024*1024):
    """
    Write the stored content of this document to ``filepath`` and
    return ``filepath``.

    The content is read from the storage backend in chunks of at most
    ``buffer_size`` so the whole file is never held in memory.  The
    storage handle and the output file are closed before returning,
    even when the copy fails.
    """
    storage_file = self.file.storage.open(self.file.url)

    try:
        output_descriptor = open(filepath, 'wb')
        try:
            while True:
                # Honor buffer_size; a bare read() loads everything at once
                copy_buffer = storage_file.read(buffer_size)
                if not copy_buffer:
                    break
                output_descriptor.write(copy_buffer)
        finally:
            output_descriptor.close()
    finally:
        storage_file.close()

    return filepath
def exists(self): def exists(self):
return self.file.storage.exists(self.file.url) return self.file.storage.exists(self.file.url)
def delete(self, *args, **kwargs): def delete(self, *args, **kwargs):
#TODO: Might not execute when done in bulk from a queryset #TODO: Might not execute when done in bulk from a queryset
#topics/db/queries.html#topics-db-queries-delete #topics/db/queries.html#topics-db-queries-delete
self.delete_fs_links() self.delete_fs_links()
super(Document, self).delete(*args, **kwargs) super(Document, self).delete(*args, **kwargs)
def get_metadata_groups(self): def get_metadata_groups(self):
errors = [] errors = []
metadata_groups = {} metadata_groups = {}
@@ -144,6 +175,7 @@ class Document(models.Model):
metadata_groups[group] = Document.objects.filter(Q(id__in=document_id_list) & ~Q(id=self.id)) or [] metadata_groups[group] = Document.objects.filter(Q(id__in=document_id_list) & ~Q(id=self.id)) or []
return metadata_groups, errors return metadata_groups, errors
def create_fs_links(self): def create_fs_links(self):
if FILESYSTEM_FILESERVING_ENABLE: if FILESYSTEM_FILESERVING_ENABLE:
if not self.exists(): if not self.exists():
@@ -171,6 +203,7 @@ class Document(models.Model):
#This should be a warning not an error #This should be a warning not an error
pass pass
def delete_fs_links(self): def delete_fs_links(self):
if FILESYSTEM_FILESERVING_ENABLE: if FILESYSTEM_FILESERVING_ENABLE:
for document_metadata_index in self.documentmetadataindex_set.all(): for document_metadata_index in self.documentmetadataindex_set.all():
@@ -209,12 +242,14 @@ class Document(models.Model):
except OSError, exc: except OSError, exc:
pass pass
#Remove the directory if it is empty #Remove the directory if it is empty
try: try:
os.removedirs(path) os.removedirs(path)
except: except:
pass pass
def next_available_filename(document, metadata_index, path, filename, extension, suffix=0): def next_available_filename(document, metadata_index, path, filename, extension, suffix=0):
target = filename target = filename
if suffix: if suffix:
@@ -344,10 +379,10 @@ class DocumentPage(models.Model):
document = models.ForeignKey(Document, verbose_name=_(u'document')) document = models.ForeignKey(Document, verbose_name=_(u'document'))
content = models.TextField(blank=True, null=True, verbose_name=_(u'content')) content = models.TextField(blank=True, null=True, verbose_name=_(u'content'))
page_label = models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label')) page_label = models.CharField(max_length=32, blank=True, null=True, verbose_name=_(u'page label'))
page_number = models.PositiveIntegerField(default=0, verbose_name=_(u'page number')) page_number = models.PositiveIntegerField(default=1, editable=False, verbose_name=_(u'page number'))
def __unicode__(self): def __unicode__(self):
return '%s - %s' % (self.page_number, self.page_label) return '%s - %s - %s' % (self.document, self.page_number, self.page_label)
class Meta: class Meta:
verbose_name = _(u'document page') verbose_name = _(u'document page')
@@ -377,7 +412,7 @@ INCLUSION_CHOICES = (
(INCLUSION_OR, _(u'or')), (INCLUSION_OR, _(u'or')),
) )
OPERATOR_CHOCIES = ( OPERATOR_CHOICES = (
('exact', _(u'is equal')), ('exact', _(u'is equal')),
('iexact', _(u'is equal (case insensitive)')), ('iexact', _(u'is equal (case insensitive)')),
('contains', _(u'contains')), ('contains', _(u'contains')),
@@ -399,7 +434,7 @@ class MetadataGroupItem(models.Model):
metadata_group = models.ForeignKey(MetadataGroup, verbose_name=_(u'metadata group')) metadata_group = models.ForeignKey(MetadataGroup, verbose_name=_(u'metadata group'))
inclusion = models.CharField(default=INCLUSION_AND, max_length=16, choices=INCLUSION_CHOICES, help_text=_(u'The inclusion is ignored for the first item.')) inclusion = models.CharField(default=INCLUSION_AND, max_length=16, choices=INCLUSION_CHOICES, help_text=_(u'The inclusion is ignored for the first item.'))
metadata_type = models.ForeignKey(MetadataType, verbose_name=_(u'metadata type'), help_text=_(u'This represents the metadata of all other documents.')) metadata_type = models.ForeignKey(MetadataType, verbose_name=_(u'metadata type'), help_text=_(u'This represents the metadata of all other documents.'))
operator = models.CharField(max_length=16, choices=OPERATOR_CHOCIES) operator = models.CharField(max_length=16, choices=OPERATOR_CHOICES)
expression = models.CharField(max_length=128, expression = models.CharField(max_length=128,
verbose_name=_(u'expression'), help_text=_(u'This expression will be evaluated against the current seleted document. The document metadata is available as variables of the same name but with the "metadata_" prefix added their name.')) verbose_name=_(u'expression'), help_text=_(u'This expression will be evaluated against the current seleted document. The document metadata is available as variables of the same name but with the "metadata_" prefix added their name.'))
negated = models.BooleanField(default=False, verbose_name=_(u'negated'), help_text=_(u'Inverts the logic of the operator.')) negated = models.BooleanField(default=False, verbose_name=_(u'negated'), help_text=_(u'Inverts the logic of the operator.'))
@@ -413,8 +448,8 @@ class MetadataGroupItem(models.Model):
verbose_name_plural = _(u'metadata group items') verbose_name_plural = _(u'metadata group items')
class DocumentTransformation(models.Model): class DocumentPageTransformation(models.Model):
document = models.ForeignKey(Document, verbose_name=_(u'document')) document_page = models.ForeignKey(DocumentPage, verbose_name=_(u'document page'))
order = models.PositiveIntegerField(blank=True, null=True, verbose_name=_(u'order')) order = models.PositiveIntegerField(blank=True, null=True, verbose_name=_(u'order'))
transformation = models.CharField(choices=TRANFORMATION_CHOICES, max_length=128, verbose_name=_(u'transformation')) transformation = models.CharField(choices=TRANFORMATION_CHOICES, max_length=128, verbose_name=_(u'transformation'))
arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use directories to indentify arguments, example: {\'degrees\':90}')) arguments = models.TextField(blank=True, null=True, verbose_name=_(u'arguments'), help_text=_(u'Use directories to indentify arguments, example: {\'degrees\':90}'))
@@ -430,10 +465,8 @@ class DocumentTransformation(models.Model):
class Meta: class Meta:
ordering = ('order',) ordering = ('order',)
verbose_name = _(u'document transformation') verbose_name = _(u'document page transformation')
verbose_name_plural = _(u'document transformations') verbose_name_plural = _(u'document page transformations')
register(Document, _(u'document'), ['document_type__name', 'file_mimetype', 'file_filename', 'file_extension', 'documentmetadata__value', 'documentpage__content']) register(Document, _(u'document'), ['document_type__name', 'file_mimetype', 'file_filename', 'file_extension', 'documentmetadata__value', 'documentpage__content'])

View File

@@ -5,6 +5,7 @@ from django.views.generic.create_update import create_object, update_object
from documents.conf.settings import PREVIEW_SIZE from documents.conf.settings import PREVIEW_SIZE
from documents.conf.settings import THUMBNAIL_SIZE from documents.conf.settings import THUMBNAIL_SIZE
from documents.conf.settings import DISPLAY_SIZE from documents.conf.settings import DISPLAY_SIZE
from documents.conf.settings import MULTIPAGE_PREVIEW_SIZE
from converter.api import QUALITY_HIGH from converter.api import QUALITY_HIGH
@@ -19,8 +20,9 @@ urlpatterns = patterns('documents.views',
url(r'^document/(?P<document_id>\d+)/delete/$', 'document_delete', (), 'document_delete'), url(r'^document/(?P<document_id>\d+)/delete/$', 'document_delete', (), 'document_delete'),
url(r'^document/(?P<document_id>\d+)/edit/$', 'document_edit', (), 'document_edit'), url(r'^document/(?P<document_id>\d+)/edit/$', 'document_edit', (), 'document_edit'),
url(r'^document/(?P<document_id>\d+)/edit/metadata/$', 'document_edit_metadata', (), 'document_edit_metadata'), url(r'^document/(?P<document_id>\d+)/edit/metadata/$', 'document_edit_metadata', (), 'document_edit_metadata'),
url(r'^document/(?P<document_id>\d+)/preview/$', 'get_document_image', {'size':PREVIEW_SIZE}, 'document_preview'), url(r'^document/(?P<document_id>\d+)/display/preview/$', 'get_document_image', {'size':PREVIEW_SIZE}, 'document_preview'),
url(r'^document/(?P<document_id>\d+)/thumbnail/$', 'get_document_image', {'size':THUMBNAIL_SIZE}, 'document_thumbnail'), url(r'^document/(?P<document_id>\d+)/display/preview/multipage/$', 'get_document_image', {'size':MULTIPAGE_PREVIEW_SIZE}, 'document_preview_multipage'),
url(r'^document/(?P<document_id>\d+)/display/thumbnail/$', 'get_document_image', {'size':THUMBNAIL_SIZE}, 'document_thumbnail'),
url(r'^document/(?P<document_id>\d+)/display/$', 'get_document_image', {'size':DISPLAY_SIZE,'quality':QUALITY_HIGH}, 'document_display'), url(r'^document/(?P<document_id>\d+)/display/$', 'get_document_image', {'size':DISPLAY_SIZE,'quality':QUALITY_HIGH}, 'document_display'),
url(r'^document/(?P<document_id>\d+)/download/$', 'document_download', (), 'document_download'), url(r'^document/(?P<document_id>\d+)/download/$', 'document_download', (), 'document_download'),
url(r'^document/(?P<document_id>\d+)/create/siblings/$', 'document_create_sibling', {'multiple':False}, 'document_create_sibling'), url(r'^document/(?P<document_id>\d+)/create/siblings/$', 'document_create_sibling', {'multiple':False}, 'document_create_sibling'),

View File

@@ -49,7 +49,6 @@ def from_descriptor_to_tempfile(input_descriptor, filename, buffer_size=1024*102
return path return path
def from_descriptor_to_new_tempfile(input_descriptor, buffer_size=1024*1024): def from_descriptor_to_new_tempfile(input_descriptor, buffer_size=1024*1024):
output_descriptor, tmp_filename = tempfile.mkstemp() output_descriptor, tmp_filename = tempfile.mkstemp()

View File

@@ -18,7 +18,8 @@ from common.utils import pretty_size
from utils import from_descriptor_to_tempfile from utils import from_descriptor_to_tempfile
from models import Document, DocumentMetadata, DocumentType, MetadataType from models import Document, DocumentMetadata, DocumentType, MetadataType, \
DocumentPage
from forms import DocumentTypeSelectForm, DocumentCreateWizard, \ from forms import DocumentTypeSelectForm, DocumentCreateWizard, \
MetadataForm, DocumentForm, DocumentForm_edit, DocumentForm_view, \ MetadataForm, DocumentForm, DocumentForm_edit, DocumentForm_view, \
StagingDocumentForm, DocumentTypeMetadataType, DocumentPreviewForm, \ StagingDocumentForm, DocumentTypeMetadataType, DocumentPreviewForm, \
@@ -122,6 +123,8 @@ def upload_document_with_type(request, document_type_id, multiple=True):
instance = local_form.save() instance = local_form.save()
instance.update_checksum() instance.update_checksum()
instance.update_mimetype() instance.update_mimetype()
instance.update_page_count()
if 'document_type_available_filenames' in local_form.cleaned_data: if 'document_type_available_filenames' in local_form.cleaned_data:
if local_form.cleaned_data['document_type_available_filenames']: if local_form.cleaned_data['document_type_available_filenames']:
instance.file_filename = local_form.cleaned_data['document_type_available_filenames'].filename instance.file_filename = local_form.cleaned_data['document_type_available_filenames'].filename
@@ -154,6 +157,7 @@ def upload_document_with_type(request, document_type_id, multiple=True):
document.save() document.save()
document.update_checksum() document.update_checksum()
document.update_mimetype() document.update_mimetype()
document.update_page_count()
except Exception, e: except Exception, e:
messages.error(request, e) messages.error(request, e)
else: else:
@@ -243,6 +247,7 @@ def document_view(request, document_id):
{'label':_(u'Time added'), 'field':lambda x: unicode(x.date_added.time()).split('.')[0]}, {'label':_(u'Time added'), 'field':lambda x: unicode(x.date_added.time()).split('.')[0]},
{'label':_(u'Checksum'), 'field':'checksum'}, {'label':_(u'Checksum'), 'field':'checksum'},
{'label':_(u'UUID'), 'field':'uuid'}, {'label':_(u'UUID'), 'field':'uuid'},
{'label':_(u'Pages'), 'field':lambda x: x.documentpage_set.count()},
]) ])
@@ -436,27 +441,33 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_
raise Http404(e) raise Http404(e)
document = get_object_or_404(Document, pk=document_id) document = get_object_or_404(Document, pk=document_id)
page = int(request.GET.get('page', 1))
transformation_list = [] transformation_list = []
for tranformation in document.documenttransformation_set.all(): try:
try: document_page = DocumentPage.objects.get(document=document, page_number=page)
transformation_list.append(tranformation.get_transformation())
except Exception, e: for tranformation in document_page.documentpagetransformation_set.all():
if request.user.is_staff: try:
messages.warning(request, _(u'Transformation %s error: %s' % (tranformation, e))) transformation_list.append(tranformation.get_transformation())
else: except Exception, e:
pass if request.user.is_staff:
messages.warning(request, _(u'Transformation %s error: %s' % (tranformation, e)))
else:
pass
except:
pass
tranformation_string = ' '.join(transformation_list) tranformation_string = ' '.join(transformation_list)
try: try:
filepath = in_image_cache(document.checksum, size=size, quality=quality, extra_options=tranformation_string) filepath = in_image_cache(document.checksum, size=size, quality=quality, extra_options=tranformation_string, page=page-1)
if filepath: if filepath:
return serve_file(request, File(file=open(filepath, 'r'))) return serve_file(request, File(file=open(filepath, 'r')))
#Save to a temporary location #Save to a temporary location
document.file.open() document.file.open()
desc = document.file.storage.open(document.file.path) desc = document.file.storage.open(document.file.path)
filepath = from_descriptor_to_tempfile(desc, document.checksum) filepath = from_descriptor_to_tempfile(desc, document.checksum)
output_file = convert(filepath, size=size, format='jpg', quality=quality, extra_options=tranformation_string) output_file = convert(filepath, size=size, format='jpg', quality=quality, extra_options=tranformation_string, page=page-1)
return serve_file(request, File(file=open(output_file, 'r')), content_type='image/jpeg') return serve_file(request, File(file=open(output_file, 'r')), content_type='image/jpeg')
except Exception, e: except Exception, e:
if size == THUMBNAIL_SIZE: if size == THUMBNAIL_SIZE:
@@ -523,6 +534,7 @@ def document_transformation_list(request, document_id):
document = get_object_or_404(Document, pk=document_id) document = get_object_or_404(Document, pk=document_id)
return object_list( return object_list(
request, request,
queryset=document.documenttransformation_set.all(), queryset=document.documenttransformation_set.all(),
@@ -539,9 +551,9 @@ def document_transformation_delete(request, document_transformation_id):
except Unauthorized, e: except Unauthorized, e:
raise Http404(e) raise Http404(e)
document_transformation = get_object_or_404(DocumentTransformation, pk=document_transformation_id) document_transformation = get_object_or_404(DocumentPageTransformation, pk=document_transformation_id)
return delete_object(request, model=DocumentTransformation, object_id=document_transformation_id, return delete_object(request, model=DocumentPageTransformation, object_id=document_transformation_id,
template_name='generic_confirm.html', template_name='generic_confirm.html',
post_delete_redirect=reverse('document_transformation_list'), post_delete_redirect=reverse('document_transformation_list'),
extra_context={ extra_context={

View File

@@ -5,6 +5,7 @@ from django.shortcuts import render_to_response
from django.template import RequestContext from django.template import RequestContext
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
from common.conf import settings as common_settings
from documents.conf import settings as documents_settings from documents.conf import settings as documents_settings
from converter.conf import settings as converter_settings from converter.conf import settings as converter_settings
from ocr.conf import settings as ocr_settings from ocr.conf import settings as ocr_settings
@@ -34,7 +35,9 @@ def check_settings(request):
{'name':'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', 'value':documents_settings.FILESYSTEM_FILESERVING_PATH, 'exists':True}, {'name':'DOCUMENTS_FILESYSTEM_FILESERVING_PATH', 'value':documents_settings.FILESYSTEM_FILESERVING_PATH, 'exists':True},
{'name':'DOCUMENTS_SLUGIFY_PATHS', 'value':documents_settings.FILESYSTEM_SLUGIFY_PATHS}, {'name':'DOCUMENTS_SLUGIFY_PATHS', 'value':documents_settings.FILESYSTEM_SLUGIFY_PATHS},
{'name':'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 'value':documents_settings.FILESYSTEM_MAX_RENAME_COUNT}, {'name':'DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT', 'value':documents_settings.FILESYSTEM_MAX_RENAME_COUNT},
{'name':'DOCUMENTS_TEMPORARY_DIRECTORY', 'value':documents_settings.TEMPORARY_DIRECTORY, 'exists':True},
#Common
{'name':'COMMON_TEMPORARY_DIRECTORY', 'value':common_settings.TEMPORARY_DIRECTORY, 'exists':True},
#Converter #Converter
{'name':'CONVERTER_CONVERT_PATH', 'value':converter_settings.CONVERT_PATH, 'exists':True}, {'name':'CONVERTER_CONVERT_PATH', 'value':converter_settings.CONVERT_PATH, 'exists':True},

View File

@@ -5,13 +5,13 @@ from permissions.api import register_permissions
from documents.models import Document from documents.models import Document
OCR_DOCUMENT_OCR = 'document_ocr' PERMISSION_OCR_DOCUMENT = 'ocr_document'
register_permissions('ocr', [ register_permissions('ocr', [
{'name':OCR_DOCUMENT_OCR, 'label':_(u'Submit document for OCR')}, {'name':PERMISSION_OCR_DOCUMENT, 'label':_(u'Submit document for OCR')},
]) ])
submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'args':'object.id', 'famfam':'page_lightning', 'permissions':{'namespace':'ocr', 'permissions':[OCR_DOCUMENT_OCR]}} submit_document = {'text':_('submit to OCR queue'), 'view':'submit_document', 'args':'object.id', 'famfam':'page_lightning', 'permissions':{'namespace':'ocr', 'permissions':[PERMISSION_OCR_DOCUMENT]}}
register_links(Document, [submit_document], menu_name='sidebar') register_links(Document, [submit_document], menu_name='sidebar')

View File

@@ -8,7 +8,7 @@ import tempfile
from django.utils.translation import ugettext as _ from django.utils.translation import ugettext as _
from documents.models import DocumentPage from documents.models import DocumentPage
from documents.conf.settings import TEMPORARY_DIRECTORY from common.conf.settings import TEMPORARY_DIRECTORY
from converter.api import convert_document_for_ocr from converter.api import convert_document_for_ocr
from ocr.conf.settings import TESSERACT_PATH from ocr.conf.settings import TESSERACT_PATH

View File

@@ -11,11 +11,11 @@ from django.utils.translation import ugettext as _
from permissions.api import check_permissions, Unauthorized from permissions.api import check_permissions, Unauthorized
from documents.models import Document from documents.models import Document
from ocr import OCR_DOCUMENT_OCR from ocr import PERMISSION_OCR_DOCUMENT
from api import ocr_document from api import ocr_document
def submit_document(request, document_id): def submit_document(request, document_id):
permissions = [OCR_DOCUMENT_OCR] permissions = [PERMISSION_OCR_DOCUMENT]
try: try:
check_permissions(request.user, 'ocr', permissions) check_permissions(request.user, 'ocr', permissions)
except Unauthorized, e: except Unauthorized, e:

View File

@@ -6,3 +6,6 @@
* Added the ability to group documents by their metadata * Added the ability to group documents by their metadata
* New abstracted options to adjust document conversion quality (default, low, high) * New abstracted options to adjust document conversion quality (default, low, high)
* Added permissions and roles support * Added permissions and roles support
* Added multipage document support (only tested on PDFs)
To update an existing database, run: [d.update_page_count() for d in Document.objects.all()]
* Added support for document page transformation (no GUI yet)

View File

@@ -29,6 +29,8 @@
* Permissions - DONE * Permissions - DONE
* Roles - DONE * Roles - DONE
* Assign default role to new users - DONE * Assign default role to new users - DONE
* DB stored transformations - DONE
* Recognize multi-page documents - DONE
* Document list filtering by metadata * Document list filtering by metadata
* Filterform date filtering widget * Filterform date filtering widget
* Validate GET data before saving file * Validate GET data before saving file
@@ -49,7 +51,6 @@
* Scheduled maintenance (cleanup, deferred OCRs) * Scheduled maintenance (cleanup, deferred OCRs)
* Add tags to documents * Add tags to documents
* Field for document language or autodetect * Field for document language or autodetect
* Recognize multi-page documents
* Count pages in a PDF file http://pybrary.net/pyPdf/ * Count pages in a PDF file http://pybrary.net/pyPdf/
* Download a document in different formats: (jpg, png, pdf) * Download a document in different formats: (jpg, png, pdf)
* Cache.cleanup function to delete cached images when document hash changes * Cache.cleanup function to delete cached images when document hash changes
@@ -67,6 +68,5 @@
* Download metadata group documents as a single zip file * Download metadata group documents as a single zip file
* Download original document or transformed document * Download original document or transformed document
* Include annotations in transformed documents downloads * Include annotations in transformed documents downloads
* DB stored transformations
* Document view temp transformations * Document view temp transformations
* Implement permissions decorators * Implement permissions decorators

View File

@@ -191,6 +191,7 @@ LOGIN_EXEMPT_URLS = (
#DOCUMENTS_PREVIEW_SIZE = '640x480' #DOCUMENTS_PREVIEW_SIZE = '640x480'
#DOCUMENTS_THUMBNAIL_SIZE = '50x50' #DOCUMENTS_THUMBNAIL_SIZE = '50x50'
#DOCUMENTS_DISPLAY_SIZE = '1200' #DOCUMENTS_DISPLAY_SIZE = '1200'
#DOCUMENTS_MULTIPAGE_PREVIEW_SIZE = '160x120'
# Groups # Groups
#DOCUMENTS_GROUP_MAX_RESULTS = 20 #DOCUMENTS_GROUP_MAX_RESULTS = 20
@@ -203,7 +204,7 @@ LOGIN_EXEMPT_URLS = (
#DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT = 200 #DOCUMENTS_FILESYSTEM_MAX_RENAME_COUNT = 200
# Misc # Misc
#DOCUMENTS_TEMPORARY_DIRECTORY = u'/tmp' #COMMON_TEMPORARY_DIRECTORY = u'/tmp'
# Converter # Converter
#CONVERTER_DEFAULT_OPTIONS = u'' #CONVERTER_DEFAULT_OPTIONS = u''
@@ -211,6 +212,7 @@ LOGIN_EXEMPT_URLS = (
#CONVERTER_HIGH_QUALITY_OPTIONS = u'-density 400' #CONVERTER_HIGH_QUALITY_OPTIONS = u'-density 400'
#CONVERTER_CONVERT_PATH = u'/usr/bin/convert' #CONVERTER_CONVERT_PATH = u'/usr/bin/convert'
#CONVERTER_OCR_OPTIONS = u'-colorspace Gray -depth 8 -resample 200x200' #CONVERTER_OCR_OPTIONS = u'-colorspace Gray -depth 8 -resample 200x200'
#CONVERTER_IDENTIFY_PATH = u'/usr/bin/identify'
# OCR # OCR
#OCR_TESSERACT_PATH = u'/usr/bin/tesseract' #OCR_TESSERACT_PATH = u'/usr/bin/tesseract'