diff --git a/apps/documents/conf/settings.py b/apps/documents/conf/settings.py
index e95600a785..bcd96e54e8 100755
--- a/apps/documents/conf/settings.py
+++ b/apps/documents/conf/settings.py
@@ -36,6 +36,8 @@ STAGING_FILES_PREVIEW_SIZE = getattr(settings, 'DOCUMENTS_STAGING_FILES_PREVIEW_
 DELETE_LOCAL_ORIGINAL = getattr(settings, 'DOCUMENTS_DELETE_LOCAL_ORIGINAL', False)
 AUTOMATIC_OCR = getattr(settings, 'DOCUMENTS_AUTOMATIC_OCR', False)
 ENABLE_SINGLE_DOCUMENT_UPLOAD = getattr(settings, 'DOCUMENTS_ENABLE_SINGLE_DOCUMENT_UPLOAD', True)
+UNCOMPRESS_COMPRESSED_LOCAL_FILES = getattr(settings, 'DOCUMENTS_UNCOMPRESS_COMPRESSED_LOCAL_FILES', True)
+UNCOMPRESS_COMPRESSED_STAGING_FILES = getattr(settings, 'DOCUMENTS_UNCOMPRESS_COMPRESSED_STAGING_FILES', True)
 
 # Saving
 CHECKSUM_FUNCTION = getattr(settings, 'DOCUMENTS_CHECKSUM_FUNCTION', lambda x: hashlib.sha256(x).hexdigest())
diff --git a/apps/documents/views.py b/apps/documents/views.py
index 77eaf64b85..f112fbb573 100755
--- a/apps/documents/views.py
+++ b/apps/documents/views.py
@@ -1,3 +1,5 @@
+import zipfile
+
 from django.utils.translation import ugettext as _
 from django.http import HttpResponse, HttpResponseRedirect, Http404
 from django.shortcuts import render_to_response, get_object_or_404, redirect
@@ -11,6 +13,7 @@ from django.conf import settings
 from django.utils.http import urlencode
 from django.template.defaultfilters import slugify
 from django.core.exceptions import ObjectDoesNotExist
+from django.core.files.uploadedfile import SimpleUploadedFile
 
 from common.utils import pretty_size
 from permissions.api import check_permissions, Unauthorized
@@ -42,6 +45,8 @@ from documents.conf.settings import GROUP_MAX_RESULTS
 from documents.conf.settings import GROUP_SHOW_EMPTY
 from documents.conf.settings import DEFAULT_TRANSFORMATIONS
 from documents.conf.settings import AUTOMATIC_OCR
+from documents.conf.settings import UNCOMPRESS_COMPRESSED_LOCAL_FILES
+from documents.conf.settings import UNCOMPRESS_COMPRESSED_STAGING_FILES
 
 from documents import PERMISSION_DOCUMENT_CREATE, \
@@ -110,6 +115,47 @@ def document_create_sibling(request, document_id, multiple=True):
     return HttpResponseRedirect('%s?%s' % (url, urlencode(urldata)))
 
 
+def _handle_save_document(request, document, form=None):
+    document.update_checksum()
+    document.update_mimetype()
+    document.update_page_count()
+    document.apply_default_transformations()
+    if AUTOMATIC_OCR:
+        document_queue = add_document_to_queue(document)
+        messages.success(request, _(u'Document: %(document)s was added to the OCR queue: %(queue)s.') % {
+            'document':document, 'queue':document_queue.label})
+
+    if form and 'document_type_available_filenames' in form.cleaned_data:
+        if form.cleaned_data['document_type_available_filenames']:
+            document.file_filename = form.cleaned_data['document_type_available_filenames'].filename
+            document.save()
+
+    save_metadata_list(decode_metadata_from_url(request.GET), document)
+    try:
+        document.create_fs_links()
+    except Exception, e:
+        messages.error(request, e)
+
+
+def _handle_zip_file(request, uploaded_file, document_type):
+    filename = getattr(uploaded_file, 'filename', getattr(uploaded_file, 'name', ''))
+    if filename.lower().endswith('zip'):
+        zfobj = zipfile.ZipFile(uploaded_file)
+        for filename in zfobj.namelist():
+            if not filename.endswith('/'):
+                zip_document = Document(file=SimpleUploadedFile(
+                    name=filename, content=zfobj.read(filename)),
+                    document_type=document_type)
+                zip_document.save()
+                _handle_save_document(request, zip_document)
+                messages.success(request, _(u'Extracted file: %s, uploaded successfully.') % filename)
+        #Signal that uploaded file was a zip file
+        return True
+    else:
+        #Otherwise tell parent to handle file
+        return False
+
+
 def upload_document_with_type(request, document_type_id, multiple=True):
     permissions = [PERMISSION_DOCUMENT_CREATE]
     try:
@@ -128,28 +174,14 @@ def upload_document_with_type(request, document_type_id, multiple=True):
             local_form = DocumentForm(request.POST, request.FILES, prefix='local', initial={'document_type':document_type})
             if local_form.is_valid():
-                instance = local_form.save()
-                instance.update_checksum()
-                instance.update_mimetype()
-                instance.update_page_count()
-                instance.apply_default_transformations()
-                if AUTOMATIC_OCR:
-                    document_queue = add_document_to_queue(instance)
-                    messages.success(request, _(u'Document: %(document)s was added to the OCR queue: %(queue)s.') % {
-                        'document':instance, 'queue':document_queue.label})
-
-                if 'document_type_available_filenames' in local_form.cleaned_data:
-                    if local_form.cleaned_data['document_type_available_filenames']:
-                        instance.file_filename = local_form.cleaned_data['document_type_available_filenames'].filename
-                        instance.save()
-
-                save_metadata_list(decode_metadata_from_url(request.GET), instance)
-                messages.success(request, _(u'Document uploaded successfully.'))
                 try:
-                    instance.create_fs_links()
+                    if (not UNCOMPRESS_COMPRESSED_LOCAL_FILES) or (UNCOMPRESS_COMPRESSED_LOCAL_FILES and not _handle_zip_file(request, request.FILES['local-file'], document_type)):
+                        instance = local_form.save()
+                        _handle_save_document(request, instance, local_form)
+                        messages.success(request, _(u'Document uploaded successfully.'))
                 except Exception, e:
                     messages.error(request, e)
-
+
                 if multiple:
                     return HttpResponseRedirect(request.get_full_path())
                 else:
@@ -158,51 +190,24 @@ def upload_document_with_type(request, document_type_id, multiple=True):
             staging_form = StagingDocumentForm(request.POST, request.FILES, prefix='staging', initial={'document_type':document_type})
             if staging_form.is_valid():
-                staging_file_id = staging_form.cleaned_data['staging_file_id']
-                try:
-                    staging_file = StagingFile.get(staging_file_id)
-                except Exception, e:
-                    messages.error(request, e)
-                else:
-                    try:
+                try:
+                    staging_file = StagingFile.get(staging_form.cleaned_data['staging_file_id'])
+                    if (not UNCOMPRESS_COMPRESSED_STAGING_FILES) or (UNCOMPRESS_COMPRESSED_STAGING_FILES and not _handle_zip_file(request, staging_file.upload(), document_type)):
                         document = Document(file=staging_file.upload(), document_type=document_type)
                         document.save()
-                        document.update_checksum()
-                        document.update_mimetype()
-                        document.update_page_count()
-                        document.apply_default_transformations()
-                        if AUTOMATIC_OCR:
-                            document_queue = add_document_to_queue(document)
-                            messages.success(request, _(u'Document: %(document)s was added to the OCR queue: %(queue)s.') % {
-                                'document':document, 'queue':document_queue.label})
-                    except Exception, e:
-                        messages.error(request, e)
-                    else:
-
-                        if 'document_type_available_filenames' in staging_form.cleaned_data:
-                            if staging_form.cleaned_data['document_type_available_filenames']:
-                                document.file_filename = staging_form.cleaned_data['document_type_available_filenames'].filename
-                                document.save()
-
-                        save_metadata_list(decode_metadata_from_url(request.GET), document)
+                        _handle_save_document(request, document, staging_form)
                         messages.success(request, _(u'Staging file: %s, uploaded successfully.') % staging_file.filename)
-                        try:
-                            document.create_fs_links()
-                        except Exception, e:
-                            messages.error(request, e)
-
-                        if DELETE_STAGING_FILE_AFTER_UPLOAD:
-                            try:
-                                staging_file.delete()
-                                messages.success(request, _(u'Staging file: %s, deleted successfully.') % staging_file.filename)
-                            except Exception, e:
-                                messages.error(request, e)
+
+                    if DELETE_STAGING_FILE_AFTER_UPLOAD:
+                        staging_file.delete()
+                        messages.success(request, _(u'Staging file: %s, deleted successfully.') % staging_file.filename)
+                except Exception, e:
+                    messages.error(request, e)
 
-                if multiple:
-                    return HttpResponseRedirect(request.META['HTTP_REFERER'])
-                else:
-                    return HttpResponseRedirect(reverse('document_list'))
+            if multiple:
+                return HttpResponseRedirect(request.META['HTTP_REFERER'])
+            else:
+                return HttpResponseRedirect(reverse('document_list'))
 
         context = {
diff --git a/docs/Changelog.txt b/docs/Changelog.txt
index c9368dee76..cdbc2da021 100644
--- a/docs/Changelog.txt
+++ b/docs/Changelog.txt
@@ -14,3 +14,4 @@
 * Added unpaper to the OCR convertion pipe
 * Added support for concurrent, queued OCR processing using celery
 * Added sentry to monitor and store error for later debugging
+* Zip files can now be uncompressed in memory and their contents uploaded individually in one step
diff --git a/docs/TODO b/docs/TODO
index 4abf45d113..e81dcc3a30 100755
--- a/docs/TODO
+++ b/docs/TODO
@@ -35,6 +35,7 @@
 * Count pages in a PDF file http://pybrary.net/pyPdf/ - NOT NEEDED
 * Support distributed OCR queues (RabbitMQ & Celery?) - DONE
 * MuliThreading deferred OCR - DONE
+* Handle zipped or rar archives - DONE (zip only)
 * Role editing view under setup - STARTED
 * Scheduled maintenance (cleanup, deferred OCR's) - DONE
 * Document list filtering by metadata
@@ -62,7 +63,6 @@
 * DXF viewer - http://code.google.com/p/dxf-reader/source/browse/#svn%2Ftrunk
 * Support spreadsheets, wordprocessing docs using openoffice in server mode
 * WebDAV support
-* Handle ziped or rar archives
 * Display preferences 'document transformations' (Rotation, default zoom)
 * Gallery view for document groups
 * Download metadata group documents as a single zip file
@@ -80,3 +80,4 @@
 * Enable/disable ocr queue view & links
 * Restrict view permission free form rename
 * Add per node max ocr concurrent execution
+* Staging file hash collision when same file is uploaded with a different name, new hash = content hash + filename hash
diff --git a/settings.py b/settings.py
index 6e17899fc9..adf366e538 100755
--- a/settings.py
+++ b/settings.py
@@ -185,7 +185,8 @@ LOGIN_EXEMPT_URLS = (
 #DOCUMENTS_STAGING_FILES_PREVIEW_SIZE = '640x480'
 #DOCUMENTS_AUTOMATIC_OCR = False
 #DOCUMENTS_ENABLE_SINGLE_DOCUMENT_UPLOAD = True
-
+#DOCUMENTS_UNCOMPRESS_COMPRESSED_LOCAL_FILES = True
+#DOCUMENTS_UNCOMPRESS_COMPRESSED_STAGING_FILES = True
 # Saving
 #DOCUMENTS_CHECKSUM_FUNCTION = lambda x: hashlib.sha256(x).hexdigest())