diff --git a/apps/common/__init__.py b/apps/common/__init__.py index 9d177e93ed..1172d62f79 100644 --- a/apps/common/__init__.py +++ b/apps/common/__init__.py @@ -2,11 +2,10 @@ import tempfile from django.utils.translation import ugettext_lazy as _ -from common.conf import settings as common_settings from navigation.api import register_links -TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY \ - if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp() +from common.conf import settings as common_settings +from common.utils import validate_path def has_usable_password(context): @@ -17,3 +16,6 @@ current_user_details = {'text': _(u'user details'), 'view': 'current_user_detail current_user_edit = {'text': _(u'edit details'), 'view': 'current_user_edit', 'famfam': 'vcard_edit'} register_links(['current_user_details', 'current_user_edit', 'password_change_view'], [current_user_details, current_user_edit, password_change_view], menu_name='secondary_menu') + +if (validate_path(common_settings.TEMPORARY_DIRECTORY) == False) or (not common_settings.TEMPORARY_DIRECTORY): + setattr(common_settings, 'TEMPORARY_DIRECTORY', tempfile.mkdtemp()) diff --git a/apps/common/utils.py b/apps/common/utils.py index 13abb05627..eacaba7923 100644 --- a/apps/common/utils.py +++ b/apps/common/utils.py @@ -2,6 +2,7 @@ import os import re import types +import tempfile from django.utils.http import urlquote as django_urlquote from django.utils.http import urlencode as django_urlencode @@ -12,6 +13,15 @@ from django.contrib.contenttypes.models import ContentType from django.contrib.auth.models import User +try: + from python_magic import magic + USE_PYTHON_MAGIC = True +except: + import mimetypes + mimetypes.init() + USE_PYTHON_MAGIC = False + + def urlquote(link=None, get=None): u''' This method does both: urlquote() and urlencode() @@ -337,3 +347,50 @@ def return_diff(old_obj, new_obj, attrib_list=None): } return diff_dict + + +def get_mimetype(filepath): + """ + Determine a file's mimetype by calling the system's libmagic + library via python-magic or fallback to use python's mimetypes + library + """ + file_mimetype = u'' + file_mime_encoding = u'' + + if USE_PYTHON_MAGIC: + if os.path.exists(filepath): + try: + source = open(filepath, 'r') + mime = magic.Magic(mime=True) + file_mimetype = mime.from_buffer(source.read()) + source.seek(0) + mime_encoding = magic.Magic(mime_encoding=True) + file_mime_encoding = mime_encoding.from_buffer(source.read()) + finally: + if source: + source.close() + else: + path, filename = os.path.split(filepath) + file_mimetype, file_mime_encoding = mimetypes.guess_type(filename) + + return file_mimetype, file_mime_encoding + + +def validate_path(path): + if os.path.exists(path) != True: + # If doesn't exist try to create it + try: + os.mkdir(path) + except: + return False + + # Check if it is writable + try: + fd, test_filepath = tempfile.mkstemp(dir=path) + os.close(fd) + os.unlink(test_filepath) + except: + return False + + return True diff --git a/apps/converter/api.py b/apps/converter/api.py index 2bed8c6125..3a5b855ada 100644 --- a/apps/converter/api.py +++ b/apps/converter/api.py @@ -2,14 +2,12 @@ import os import subprocess import hashlib -from common import TEMPORARY_DIRECTORY -from documents.utils import document_save_to_temp_dir +from common.conf.settings import TEMPORARY_DIRECTORY from converter.conf.settings import UNOCONV_PATH -from converter.exceptions import UnpaperError, OfficeConversionError +from converter.exceptions import OfficeConversionError from converter.literals import DEFAULT_PAGE_NUMBER, \ - QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \ - DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH + DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT from converter import backend from converter.literals import TRANSFORMATION_CHOICES @@ -17,6 +15,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \ TRANSFORMATION_ZOOM from converter.literals import DIMENSION_SEPARATOR +from converter.utils import cleanup HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() @@ -24,15 +23,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [ u'ods', u'docx', u'doc' ] -def cleanup(filename): - """ - Tries to remove the given filename. Ignores non-existent files - """ - try: - os.remove(filename) - except OSError: - pass - def execute_unoconv(input_filepath, arguments=''): """ @@ -70,26 +60,19 @@ def convert_office_document(input_filepath): return None -def convert_document(document, *args, **kwargs): - document_filepath = create_image_cache_filename(document.checksum, *args, **kwargs) - if os.path.exists(document_filepath): - return document_filepath - - return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs) - - -def convert(input_filepath, cleanup_files=True, *args, **kwargs): +def convert(input_filepath, output_filepath=None, cleanup_files=False, *args, **kwargs): size = kwargs.get('size') file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT) zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL) rotation = kwargs.get('rotation', DEFAULT_ROTATION) page = kwargs.get('page', DEFAULT_PAGE_NUMBER) - quality = kwargs.get('quality', QUALITY_DEFAULT) transformations = kwargs.get('transformations', []) unoconv_output = None - output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs) + if output_filepath is None: + output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs) + if os.path.exists(output_filepath): return output_filepath @@ -125,7 +108,7 @@ def convert(input_filepath, cleanup_files=True, *args, **kwargs): ) try: - backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format) + backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format) finally: if cleanup_files: cleanup(input_filepath) diff --git a/apps/converter/backends/graphicsmagick/base.py b/apps/converter/backends/graphicsmagick/base.py index 54ebbaaa95..1d70108a94 100644 --- a/apps/converter/backends/graphicsmagick/base.py +++ b/apps/converter/backends/graphicsmagick/base.py @@ -3,7 +3,6 @@ import re from converter.conf.settings import GM_PATH from converter.conf.settings import GM_SETTINGS -from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, \ IdentifyError from converter.backends import ConverterBase @@ -31,8 +30,10 @@ class ConverterClass(ConverterBase): raise IdentifyError(proc.stderr.readline()) return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): arguments = [] + + if transformations: for transformation in transformations: if transformation['transformation'] == TRANSFORMATION_RESIZE: @@ -51,7 +52,7 @@ class ConverterClass(ConverterBase): arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) - if format == u'jpeg': + if file_format.lower() == u'jpeg' or file_format.lower() == u'jpg': arguments.append(u'-quality') arguments.append(u'85') @@ -64,7 +65,6 @@ class ConverterClass(ConverterBase): command = [] command.append(unicode(GM_PATH)) command.append(u'convert') - command.extend(unicode(QUALITY_SETTINGS[quality]).split()) command.extend(unicode(GM_SETTINGS).split()) command.append(unicode(input_arg)) if arguments: diff --git a/apps/converter/backends/imagemagick/base.py b/apps/converter/backends/imagemagick/base.py index 4f924316ed..977da783c2 100644 --- a/apps/converter/backends/imagemagick/base.py +++ b/apps/converter/backends/imagemagick/base.py @@ -3,7 +3,6 @@ import re from converter.conf.settings import IM_IDENTIFY_PATH from converter.conf.settings import IM_CONVERT_PATH -from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS from converter.exceptions import ConvertError, UnknownFormat, \ IdentifyError from converter.backends import ConverterBase @@ -30,7 +29,7 @@ class ConverterClass(ConverterBase): raise IdentifyError(proc.stderr.readline()) return proc.stdout.read() - def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): arguments = [] if transformations: for transformation in transformations: @@ -50,7 +49,7 @@ class ConverterClass(ConverterBase): arguments.append(u'-rotate') arguments.append(u'%s' % transformation['arguments']['degrees']) - if format == u'jpeg': + if file_format.lower() == u'jpeg' or file_format.lower() == u'jpg': arguments.append(u'-quality') arguments.append(u'85') @@ -62,7 +61,6 @@ class ConverterClass(ConverterBase): command = [] command.append(unicode(IM_CONVERT_PATH)) - command.extend(unicode(QUALITY_SETTINGS[quality]).split()) command.append(unicode(input_arg)) if arguments: command.extend(arguments) diff --git a/apps/converter/backends/python/base.py b/apps/converter/backends/python/base.py index 25448346ff..e854ab6243 100644 --- a/apps/converter/backends/python/base.py +++ b/apps/converter/backends/python/base.py @@ -1,16 +1,21 @@ +import tempfile +import os + import slate from PIL import Image +import ghostscript from django.utils.translation import ugettext_lazy as _ -from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS +from common.utils import get_mimetype + from converter.exceptions import ConvertError, UnknownFormat, IdentifyError from converter.backends import ConverterBase from converter.literals import TRANSFORMATION_RESIZE, \ TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM -from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \ +from converter.literals import DEFAULT_PAGE_NUMBER, \ DEFAULT_FILE_FORMAT -from converter.utils import get_mimetype +from converter.utils import cleanup class ConverterClass(ConverterBase): @@ -42,11 +47,45 @@ class ConverterClass(ConverterBase): return page_count - def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT): + tmpfile = None + mimetype, encoding = get_mimetype(input_filepath) + if mimetype == 'application/pdf': + # If file is a PDF open it with ghostscript and convert it to + # TIFF + first_page_tmpl = '-dFirstPage=%d' % page + last_page_tmpl = '-dLastPage=%d' % page + fd, tmpfile = tempfile.mkstemp() + os.close(fd) + output_file_tmpl = '-sOutputFile=%s' % tmpfile + input_file_tmpl = '-f%s' % input_filepath + args = [ + 'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH', + '-dNOPAUSE', '-dNOPROMPT', + first_page_tmpl, last_page_tmpl, + '-sDEVICE=jpeg', '-dJPEGQ=75', + '-r150', output_file_tmpl, + input_file_tmpl, + '-c "60000000 setvmthreshold"', # use 30MB + '-dNOGC', # No garbage collection + '-dMaxBitmap=500000000', + '-dAlignToPixels=0', + '-dGridFitTT=0', + '-dTextAlphaBits=4', + '-dGraphicsAlphaBits=4', + ] + + ghostscript.Ghostscript(*args) + page = 1 # Don't execute the following while loop + input_filepath = tmpfile + try: im = Image.open(input_filepath) except Exception: # Python Imaging Library doesn't recognize it as an image raise UnknownFormat + finally: + if tmpfile: + cleanup(tmpfile) current_page = 0 try: @@ -58,12 +97,12 @@ class ConverterClass(ConverterBase): pass # end of sequence if transformations: + aspect = 1.0 * im.size[0] / im.size[1] for transformation in transformations: - aspect = 1.0 * im.size[1] / im.size[0] if transformation['transformation'] == TRANSFORMATION_RESIZE: width = int(transformation['arguments']['width']) height = int(transformation['arguments'].get('height', 1.0 * width * aspect)) - im = im.resize((width, height), Image.ANTIALIAS) + im = self.resize(im, (width, height)) elif transformation['transformation'] == TRANSFORMATION_ZOOM: decimal_value = float(transformation['arguments']['percent']) / 100 im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1])) @@ -73,6 +112,7 @@ class ConverterClass(ConverterBase): if im.mode not in ('L', 'RGB'): im = im.convert('RGB') + im.save(output_filepath, format=file_format) def get_format_list(self): @@ -91,3 +131,41 @@ class ConverterClass(ConverterBase): TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \ TRANSFORMATION_ZOOM ] + + # From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python + def resize(self, img, box, fit=False, out=None): + '''Downsample the image. + @param img: Image - an Image-object + @param box: tuple(x, y) - the bounding box of the result image + @param fit: boolean - crop the image to fill the box + @param out: file-like-object - save the image into the output stream + ''' + #preresize image with factor 2, 4, 8 and fast algorithm + factor = 1 + while img.size[0]/factor > 2*box[0] and img.size[1]*2/factor > 2*box[1]: + factor *=2 + if factor > 1: + img.thumbnail((img.size[0]/factor, img.size[1]/factor), Image.NEAREST) + + #calculate the cropping box and get the cropped part + if fit: + x1 = y1 = 0 + x2, y2 = img.size + wRatio = 1.0 * x2/box[0] + hRatio = 1.0 * y2/box[1] + if hRatio > wRatio: + y1 = y2/2-box[1]*wRatio/2 + y2 = y2/2+box[1]*wRatio/2 + else: + x1 = x2/2-box[0]*hRatio/2 + x2 = x2/2+box[0]*hRatio/2 + img = img.crop((x1,y1,x2,y2)) + + #Resize the image with best quality algorithm ANTI-ALIAS + img.thumbnail(box, Image.ANTIALIAS) + + if out: + #save it into a file-like object + img.save(out, "JPEG", quality=75) + else: + return img diff --git a/apps/converter/conf/settings.py b/apps/converter/conf/settings.py index 95aee33b92..08377880b4 100644 --- a/apps/converter/conf/settings.py +++ b/apps/converter/conf/settings.py @@ -14,9 +14,7 @@ register_settings( {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')}, {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True}, #{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'}, - {'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''}, - {'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''}, - {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'}, - {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'}, + #{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'}, + #{'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'}, ] ) diff --git a/apps/converter/exceptions.py b/apps/converter/exceptions.py index c906fc5c95..1880f0ba39 100644 --- a/apps/converter/exceptions.py +++ b/apps/converter/exceptions.py @@ -13,13 +13,6 @@ class UnknownFormat(ConvertError): pass -class UnpaperError(ConvertError): - """ - Raised by unpaper - """ - pass - - class IdentifyError(ConvertError): """ Raised by identify diff --git a/apps/converter/literals.py b/apps/converter/literals.py index 915630416a..66a17d0d67 100644 --- a/apps/converter/literals.py +++ b/apps/converter/literals.py @@ -1,27 +1,10 @@ from django.utils.translation import ugettext_lazy as _ -from converter.conf.settings import DEFAULT_OPTIONS -from converter.conf.settings import LOW_QUALITY_OPTIONS -from converter.conf.settings import HIGH_QUALITY_OPTIONS -from converter.conf.settings import PRINT_QUALITY_OPTIONS - DEFAULT_ZOOM_LEVEL = 100 DEFAULT_ROTATION = 0 DEFAULT_PAGE_NUMBER = 1 DEFAULT_FILE_FORMAT = u'jpeg' -QUALITY_DEFAULT = u'quality_default' -QUALITY_LOW = u'quality_low' -QUALITY_HIGH = u'quality_high' -QUALITY_PRINT = u'quality_print' - -QUALITY_SETTINGS = { - QUALITY_DEFAULT: DEFAULT_OPTIONS, - QUALITY_LOW: LOW_QUALITY_OPTIONS, - QUALITY_HIGH: HIGH_QUALITY_OPTIONS, - QUALITY_PRINT: PRINT_QUALITY_OPTIONS -} - DIMENSION_SEPARATOR = u'x' TRANSFORMATION_RESIZE = u'resize' diff --git a/apps/converter/utils.py b/apps/converter/utils.py index 4653b6dc9d..26ad9c4b74 100644 --- a/apps/converter/utils.py +++ b/apps/converter/utils.py @@ -2,14 +2,6 @@ import os from django.core.exceptions import ImproperlyConfigured from django.utils.importlib import import_module - -try: - from python_magic import magic - USE_PYTHON_MAGIC = True -except: - import mimetypes - mimetypes.init() - USE_PYTHON_MAGIC = False #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python @@ -82,30 +74,11 @@ def load_backend(): raise # If there's some other error, this must be an error in Mayan itself. -def get_mimetype(filepath): +def cleanup(filename): """ - Determine a file's mimetype by calling the system's libmagic - library via python-magic or fallback to use python's mimetypes - library + Tries to remove the given filename. Ignores non-existent files """ - file_mimetype = u'' - file_mime_encoding = u'' - - if USE_PYTHON_MAGIC: - if os.path.exists(filepath): - try: - source = open(filepath, 'r') - mime = magic.Magic(mime=True) - file_mimetype = mime.from_buffer(source.read()) - source.seek(0) - mime_encoding = magic.Magic(mime_encoding=True) - file_mime_encoding = mime_encoding.from_buffer(source.read()) - finally: - if source: - source.close() - else: - path, filename = os.path.split(filepath) - file_mimetype, file_mime_encoding = mimetypes.guess_type(filename) - - return file_mimetype, file_mime_encoding - + try: + os.remove(filename) + except OSError: + pass diff --git a/apps/documents/__init__.py b/apps/documents/__init__.py index 35007412d7..162abfd605 100644 --- a/apps/documents/__init__.py +++ b/apps/documents/__init__.py @@ -2,6 +2,7 @@ from django.utils.translation import ugettext_lazy as _ from django.core.urlresolvers import reverse from django.conf import settings +from common.utils import validate_path from navigation.api import register_links, register_top_menu, \ register_model_list_columns, register_multi_item_links, \ register_sidebar_template @@ -24,8 +25,25 @@ from documents.literals import HISTORY_DOCUMENT_CREATED, \ HISTORY_DOCUMENT_EDITED, HISTORY_DOCUMENT_DELETED from documents.conf.settings import ZOOM_MAX_LEVEL from documents.conf.settings import ZOOM_MIN_LEVEL +from documents.conf.settings import CACHE_PATH from documents.widgets import document_thumbnail +# Document page links expressions +def is_first_page(context): + return context['object'].page_number <= 1 + + +def is_last_page(context): + return context['object'].page_number >= context['object'].document.documentpage_set.count() + + +def is_min_zoom(context): + return context['zoom'] <= ZOOM_MIN_LEVEL + + +def is_max_zoom(context): + return context['zoom'] >= ZOOM_MAX_LEVEL + # Permission setup set_namespace_title('documents', _(u'Documents')) register_permission(PERMISSION_DOCUMENT_CREATE) @@ -48,23 +66,6 @@ register_history_type(HISTORY_DOCUMENT_CREATED) register_history_type(HISTORY_DOCUMENT_EDITED) register_history_type(HISTORY_DOCUMENT_DELETED) - -# Document page links expressions -def is_first_page(context): - return context['object'].page_number <= 1 - - -def is_last_page(context): - return context['object'].page_number >= context['object'].document.documentpage_set.count() - - -def is_min_zoom(context): - return context['zoom'] <= ZOOM_MIN_LEVEL - - -def is_max_zoom(context): - return context['zoom'] >= ZOOM_MAX_LEVEL - document_list = {'text': _(u'all documents'), 'view': 'document_list', 'famfam': 'page', 'permissions': [PERMISSION_DOCUMENT_VIEW]} document_list_recent = {'text': _(u'recent documents'), 'view': 'document_list_recent', 'famfam': 'page', 'permissions': [PERMISSION_DOCUMENT_VIEW]} document_create_multiple = {'text': _(u'upload new documents'), 'view': 'document_create_multiple', 'famfam': 'page_add', 'permissions': [PERMISSION_DOCUMENT_CREATE]} @@ -198,3 +199,5 @@ register_sidebar_template(['document_type_list'], 'document_types_help.html') register_links(Document, [document_view_simple], menu_name='form_header', position=0) register_links(Document, [document_view_advanced], menu_name='form_header', position=1) register_links(Document, [document_history_view], menu_name='form_header') + +validate_path(CACHE_PATH) diff --git a/apps/documents/conf/settings.py b/apps/documents/conf/settings.py index 4c7749624c..5da5542b94 100644 --- a/apps/documents/conf/settings.py +++ b/apps/documents/conf/settings.py @@ -2,8 +2,10 @@ import hashlib import uuid +import os from django.utils.translation import ugettext_lazy as _ +from django.conf import settings from storage.backends.filebasedstorage import FileBasedStorage from smart_settings.api import register_settings @@ -38,5 +40,7 @@ register_settings( {'name': u'ZOOM_MAX_LEVEL', 'global_name': u'DOCUMENTS_ZOOM_MAX_LEVEL', 'default': 200, 'description': _(u'Maximum amount in percent (%) to allow user to zoom in a document page interactively.')}, {'name': u'ZOOM_MIN_LEVEL', 'global_name': u'DOCUMENTS_ZOOM_MIN_LEVEL', 'default': 50, 'description': _(u'Minimum amount in percent (%) to allow user to zoom out a document page interactively.')}, {'name': u'ROTATION_STEP', 'global_name': u'DOCUMENTS_ROTATION_STEP', 'default': 90, 'description': _(u'Amount in degrees to rotate a document page per user interaction.')}, + # + {'name': u'CACHE_PATH', 'global_name': u'DOCUMENTS_CACHE_PATH', 'default': os.path.join(settings.PROJECT_ROOT, 'image_cache'), 'exists': True}, ] ) diff --git a/apps/documents/models.py b/apps/documents/models.py index b3eadb08e7..d33bf1112d 100644 --- a/apps/documents/models.py +++ b/apps/documents/models.py @@ -1,11 +1,13 @@ import os import tempfile +import hashlib from django.db import models from django.utils.translation import ugettext_lazy as _ from django.contrib.auth.models import User from django.contrib.contenttypes import generic from django.contrib.comments.models import Comment +from django.conf import settings from python_magic import magic @@ -13,12 +15,26 @@ from taggit.managers import TaggableManager from dynamic_search.api import register from converter.api import get_page_count from converter.api import get_available_transformations_choices +from converter.api import create_image_cache_filename, convert +from converter.exceptions import UnknownFormat, UnkownConvertError from documents.conf.settings import CHECKSUM_FUNCTION from documents.conf.settings import UUID_FUNCTION from documents.conf.settings import STORAGE_BACKEND +from documents.conf.settings import PREVIEW_SIZE +from documents.conf.settings import THUMBNAIL_SIZE +from documents.conf.settings import CACHE_PATH + from documents.managers import RecentDocumentManager, \ DocumentPageTransformationManager +from documents.utils import document_save_to_temp_dir +from documents.literals import PICTURE_ERROR_SMALL, PICTURE_ERROR_MEDIUM, \ + PICTURE_UNKNOWN_SMALL, PICTURE_UNKNOWN_MEDIUM +from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \ + DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER + +# document image cache name hash function +HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest() def get_filename_from_uuid(instance, filename): @@ -201,8 +217,7 @@ class Document(models.Model): exists in storage """ return self.file.storage.exists(self.file.path) - - + def apply_default_transformations(self, transformations): #Only apply default transformations on new documents if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0: @@ -216,6 +231,29 @@ class Document(models.Model): ) page_transformation.save() + + def get_image_cache_name(self, page): + document_page = self.documentpage_set.get(page_number=page) + transformations, warnings = document_page.get_transformation_list() + hash_value = HASH_FUNCTION(u''.join([self.checksum, unicode(page), unicode(transformations)])) + cache_file_path = os.path.join(CACHE_PATH, hash_value) + if os.path.exists(cache_file_path): + return cache_file_path + else: + document_file = document_save_to_temp_dir(self, self.checksum) + return convert(document_file, output_filepath=cache_file_path, page=page, transformations=transformations) + + def get_image(self, size=PREVIEW_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION): + try: + image_cache_name = self.get_image_cache_name(page=page) + output_file = convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation) + except UnknownFormat: + output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_SMALL) + except UnkownConvertError: + output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL) + except Exception, e: + output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL) + return output_file class DocumentTypeFilename(models.Model): diff --git a/apps/documents/templatetags/printing_tags.py b/apps/documents/templatetags/printing_tags.py index 33f560bdf1..d556f6e10d 100644 --- a/apps/documents/templatetags/printing_tags.py +++ b/apps/documents/templatetags/printing_tags.py @@ -1,6 +1,6 @@ from django.template import Library, Node, Variable -from converter.api import get_document_dimensions, QUALITY_PRINT +from converter.api import get_document_dimensions from documents.views import calculate_converter_arguments from documents.conf.settings import PRINT_SIZE @@ -14,8 +14,7 @@ class GetImageSizeNode(Node): def render(self, context): document = Variable(self.document).resolve(context) - arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, quality=QUALITY_PRINT) - width, height = get_document_dimensions(document, **arguments) + width, height = get_document_dimensions(document) context[u'document_width'], context['document_height'] = width, height context[u'document_aspect'] = float(width) / float(height) return u'' diff --git a/apps/documents/urls.py b/apps/documents/urls.py index 4a8dcd2d46..19020a3448 100644 --- a/apps/documents/urls.py +++ b/apps/documents/urls.py @@ -1,7 +1,5 @@ from django.conf.urls.defaults import patterns, url -from converter.literals import QUALITY_HIGH, QUALITY_PRINT - from documents.conf.settings import PREVIEW_SIZE from documents.conf.settings import PRINT_SIZE from documents.conf.settings import THUMBNAIL_SIZE @@ -24,8 +22,8 @@ urlpatterns = patterns('documents.views', url(r'^(?P\d+)/display/preview/$', 'get_document_image', {'size': PREVIEW_SIZE}, 'document_preview'), url(r'^(?P\d+)/display/preview/multipage/$', 'get_document_image', {'size': MULTIPAGE_PREVIEW_SIZE}, 'document_preview_multipage'), url(r'^(?P\d+)/display/thumbnail/$', 'get_document_image', {'size': THUMBNAIL_SIZE}, 'document_thumbnail'), - url(r'^(?P\d+)/display/$', 'get_document_image', {'size': DISPLAY_SIZE, 'quality': QUALITY_HIGH}, 'document_display'), - url(r'^(?P\d+)/display/print/$', 'get_document_image', {'size': PRINT_SIZE, 'quality': QUALITY_PRINT}, 'document_display_print'), + url(r'^(?P\d+)/display/$', 'get_document_image', {'size': DISPLAY_SIZE}, 'document_display'), + url(r'^(?P\d+)/display/print/$', 'get_document_image', {'size': PRINT_SIZE}, 'document_display_print'), url(r'^(?P\d+)/download/$', 'document_download', (), 'document_download'), url(r'^(?P\d+)/create/siblings/$', 'document_create_siblings', (), 'document_create_siblings'), diff --git a/apps/documents/utils.py b/apps/documents/utils.py index 0c8122bcbc..32658d5f43 100644 --- a/apps/documents/utils.py +++ b/apps/documents/utils.py @@ -1,6 +1,6 @@ import os -from common import TEMPORARY_DIRECTORY +from common.conf.settings import TEMPORARY_DIRECTORY #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python diff --git a/apps/documents/views.py b/apps/documents/views.py index ee9be82b3e..f86a73f3d8 100644 --- a/apps/documents/views.py +++ b/apps/documents/views.py @@ -20,11 +20,8 @@ from common.widgets import two_state_template from common.literals import PAGE_SIZE_DIMENSIONS, \ PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE from common.conf.settings import DEFAULT_PAPER_SIZE -from converter.api import convert_document -from converter.exceptions import UnkownConvertError, UnknownFormat from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \ - DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \ - DEFAULT_PAGE_NUMBER + DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER from filetransfers.api import serve_file from grouping.utils import get_document_group_subtemplate from metadata.api import save_metadata_list, \ @@ -287,7 +284,7 @@ def document_edit(request, document_id): }, context_instance=RequestContext(request)) -def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT): +def get_document_image(request, document_id, size=PREVIEW_SIZE): check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW]) document = get_object_or_404(Document, pk=document_id) @@ -304,36 +301,7 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_ rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360 - document_page = get_object_or_404(document.documentpage_set, page_number=page) - transformations, warnings = document_page.get_transformation_list() - - if warnings and (request.user.is_staff or request.user.is_superuser): - for warning in warnings: - messages.warning(request, _(u'Page transformation error: %s') % warning) - - try: - output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations) - except UnkownConvertError, e: - if request.user.is_staff or request.user.is_superuser: - messages.error(request, e) - if size == THUMBNAIL_SIZE: - output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL) - else: - output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_MEDIUM) - except UnknownFormat: - if size == THUMBNAIL_SIZE: - output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_SMALL) - else: - output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_MEDIUM) - except Exception, e: - if request.user.is_staff or request.user.is_superuser: - messages.error(request, e) - if size == THUMBNAIL_SIZE: - output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL) - else: - output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_MEDIUM) - finally: - return sendfile.sendfile(request, output_file) + return sendfile.sendfile(request, document.get_image(size=size, page=page, zoom=zoom, rotation=rotation)) def document_download(request, document_id): @@ -804,13 +772,14 @@ def document_print(request, document_id): def document_hard_copy(request, document_id): + #TODO: FIXME check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW]) document = get_object_or_404(Document, pk=document_id) RecentDocument.objects.add_document_for_user(request.user, document) - arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, file_format=DEFAULT_FILE_FORMAT, quality=QUALITY_PRINT) + arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, file_format=DEFAULT_FILE_FORMAT) # Pre-generate convert_document(document, **arguments) diff --git a/apps/ocr/api.py b/apps/ocr/api.py index 585ef9ac2b..108fb1c5c2 100644 --- a/apps/ocr/api.py +++ b/apps/ocr/api.py @@ -9,18 +9,18 @@ import sys from django.utils.translation import ugettext as _ from django.utils.importlib import import_module -from common import TEMPORARY_DIRECTORY +from common.conf.settings import TEMPORARY_DIRECTORY from converter.api import convert from documents.models import DocumentPage -from documents.utils import document_save_to_temp_dir from ocr.conf.settings import TESSERACT_PATH from ocr.conf.settings import TESSERACT_LANGUAGE -from ocr.exceptions import TesseractError +from ocr.exceptions import TesseractError, UnpaperError from ocr.conf.settings import UNPAPER_PATH from ocr.parsers import parse_document_page from ocr.parsers.exceptions import ParserError, ParserUnknownFile -from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT +from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT, \ + DEFAULT_OCR_FILE_EXTENSION def get_language_backend(): @@ -56,8 +56,10 @@ def run_tesseract(input_filename, lang=None): os.close(fd) ocr_output = os.extsep.join([filepath, u'txt']) command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)] - if lang is not None: - command += [u'-l', lang] + + # TODO: Tesseract 3.0 segfaults + #if lang is not None: + # command.extend([u'-l', lang]) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() @@ -67,7 +69,13 @@ def run_tesseract(input_filename, lang=None): cleanup(ocr_output) raise TesseractError(error_text) - return codecs.open(ocr_output, 'r', 'utf-8'), ocr_output + fd = codecs.open(ocr_output, 'r', 'utf-8') + text = fd.read().strip() + fd.close() + + os.unlink(filepath) + + return text def do_document_ocr(queue_document): @@ -82,36 +90,37 @@ def do_document_ocr(queue_document): parse_document_page(document_page) except (ParserError, ParserUnknownFile): # Fall back to doing visual OCR - transformations = [] - document_transformations, warnings = document_page.get_transformation_list() - ocr_transformations, warnings = queue_document.get_transformation_list() - transformations.extend(document_transformations) - transformations.extend(ocr_transformations) - - unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT) - - document_filepath = os.path.join(TEMPORARY_DIRECTORY, document_page.document.uuid) - unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename) + ##ocr_transformations, warnings = queue_document.get_transformation_list() - document.save_to_file(document_filepath) + document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number) + unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT) + unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename) + + unpaper_input=convert(document_filepath, file_format=UNPAPER_FILE_FORMAT) + execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath) + + #from PIL import Image, ImageOps + #im = Image.open(document_filepath) + ##if im.mode=='RGBA': + ## im=im.convert('RGB') + ##im = im.convert('L') + #im = ImageOps.grayscale(im) + #im.save(unpaper_output_filepath) - transformed_filepath=convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, page=document_page.page_number, transformations=transformations) - execute_unpaper(input_filepath=transformed_filepath, output_filepath=unpaper_output_filepath) # Convert to TIFF pre_ocr_filepath = output_filepath=convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT) # Tesseract needs an explicit file extension - pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_FORMAT]) + pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION]) os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext) try: - fd, ocr_output = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE) - document_page.content = ocr_cleanup(fd.read().strip()) + ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE) + + document_page.content = ocr_cleanup(ocr_text) document_page.page_label = _(u'Text from OCR') document_page.save() - fd.close() - cleanup(ocr_output) finally: cleanup(pre_ocr_filepath_w_ext) - cleanup(transformed_filepath) + cleanup(unpaper_input) cleanup(document_filepath) cleanup(unpaper_output_filepath) @@ -155,6 +164,7 @@ def execute_unpaper(input_filepath, output_filepath): command = [] command.append(UNPAPER_PATH) command.append(u'--overwrite') + command.append(u'--no-multi-pages') command.append(input_filepath) command.append(output_filepath) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE) diff --git a/apps/ocr/exceptions.py b/apps/ocr/exceptions.py index b1ec8c3fe3..41ebe0c8ca 100644 --- a/apps/ocr/exceptions.py +++ b/apps/ocr/exceptions.py @@ -4,3 +4,10 @@ class AlreadyQueued(Exception): class TesseractError(Exception): pass + + +class UnpaperError(Exception): + """ + Raised by unpaper + """ + pass diff --git a/apps/ocr/literals.py b/apps/ocr/literals.py index 6a33f0e712..946c063e38 100644 --- a/apps/ocr/literals.py +++ b/apps/ocr/literals.py @@ -20,5 +20,6 @@ QUEUEDOCUMENT_STATE_CHOICES = ( (QUEUEDOCUMENT_STATE_ERROR, _(u'error')), ) -DEFAULT_OCR_FILE_FORMAT = u'tif' -UNPAPER_FILE_FORMAT = u'pnm' +DEFAULT_OCR_FILE_FORMAT = u'tiff' +DEFAULT_OCR_FILE_EXTENSION = u'tif' +UNPAPER_FILE_FORMAT = u'ppm'