Merge branch 'document_image_caching' into smart_staging

2011-07-21 03:50:19 -04:00
parent 57fed7608a 89fc258a59
commit 28ecd22944
20 changed files with 289 additions and 195 deletions
--- a/apps/common/init.py
+++ b/apps/common/init.py
@@ -2,11 +2,10 @@ import tempfile

 from django.utils.translation import ugettext_lazy as _

-from common.conf import settings as common_settings
 from navigation.api import register_links

-TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY \
-    if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp()
+from common.conf import settings as common_settings
+from common.utils import validate_path


 def has_usable_password(context):
@@ -17,3 +16,6 @@ current_user_details = {'text': _(u'user details'), 'view': 'current_user_detail
 current_user_edit = {'text': _(u'edit details'), 'view': 'current_user_edit', 'famfam': 'vcard_edit'}

 register_links(['current_user_details', 'current_user_edit', 'password_change_view'], [current_user_details, current_user_edit, password_change_view], menu_name='secondary_menu')
+
+if (validate_path(common_settings.TEMPORARY_DIRECTORY) == False) or (not common_settings.TEMPORARY_DIRECTORY):
+    setattr(common_settings, 'TEMPORARY_DIRECTORY', tempfile.mkdtemp())
--- a/apps/common/utils.py
+++ b/apps/common/utils.py
@@ -2,6 +2,7 @@
 import os
 import re
 import types
+import tempfile

 from django.utils.http import urlquote  as django_urlquote
 from django.utils.http import urlencode as django_urlencode
@@ -12,6 +13,15 @@ from django.contrib.contenttypes.models import ContentType
 from django.contrib.auth.models import User


+try:
+    from python_magic import magic
+    USE_PYTHON_MAGIC = True
+except:
+    import mimetypes
+    mimetypes.init()
+    USE_PYTHON_MAGIC = False
+    
+
 def urlquote(link=None, get=None):
    u'''
    This method does both: urlquote() and urlencode()
@@ -337,3 +347,50 @@ def return_diff(old_obj, new_obj, attrib_list=None):
            }

    return diff_dict
+
+
+def get_mimetype(filepath):
+    """
+    Determine a file's mimetype by calling the system's libmagic
+    library via python-magic or fallback to use python's mimetypes
+    library
+    """
+    file_mimetype = u''
+    file_mime_encoding = u''
+    
+    if USE_PYTHON_MAGIC:
+        if os.path.exists(filepath):
+            try:
+                source = open(filepath, 'r')
+                mime = magic.Magic(mime=True)
+                file_mimetype = mime.from_buffer(source.read())
+                source.seek(0)
+                mime_encoding = magic.Magic(mime_encoding=True)
+                file_mime_encoding = mime_encoding.from_buffer(source.read())
+            finally:
+                if source:
+                    source.close()
+    else:
+        path, filename = os.path.split(filepath)
+        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
+        
+    return file_mimetype, file_mime_encoding
+
+
+def validate_path(path):
+    if os.path.exists(path) != True:
+        # If doesn't exist try to create it
+        try:
+            os.mkdir(path)
+        except:
+            return False
+    
+    # Check if it is writable
+    try:
+        fd, test_filepath = tempfile.mkstemp(dir=path)
+        os.close(fd)
+        os.unlink(test_filepath)
+    except:
+        return False
+        
+    return True
--- a/apps/converter/api.py
+++ b/apps/converter/api.py
@@ -2,14 +2,12 @@ import os
 import subprocess
 import hashlib

-from common import TEMPORARY_DIRECTORY
-from documents.utils import document_save_to_temp_dir
+from common.conf.settings import TEMPORARY_DIRECTORY

 from converter.conf.settings import UNOCONV_PATH
-from converter.exceptions import UnpaperError, OfficeConversionError
+from converter.exceptions import OfficeConversionError
 from converter.literals import DEFAULT_PAGE_NUMBER, \
-    QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
-    DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
+    DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT

 from converter import backend
 from converter.literals import TRANSFORMATION_CHOICES
@@ -17,6 +15,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
    TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
    TRANSFORMATION_ZOOM
 from converter.literals import DIMENSION_SEPARATOR    
+from converter.utils import cleanup

 HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()
    
@@ -24,15 +23,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [
    u'ods', u'docx', u'doc'
 ]

-def cleanup(filename):
-    """
-    Tries to remove the given filename. Ignores non-existent files
-    """
-    try:
-        os.remove(filename)
-    except OSError:
-        pass
-

 def execute_unoconv(input_filepath, arguments=''):
    """
@@ -70,26 +60,19 @@ def convert_office_document(input_filepath):
    return None


-def convert_document(document, *args, **kwargs):
-    document_filepath = create_image_cache_filename(document.checksum, *args, **kwargs)
-    if os.path.exists(document_filepath):
-        return document_filepath
-
-    return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs)
-
-
-def convert(input_filepath, cleanup_files=True, *args, **kwargs):
+def convert(input_filepath, output_filepath=None, cleanup_files=False, *args, **kwargs):
    size = kwargs.get('size')
    file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
    zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
    rotation = kwargs.get('rotation', DEFAULT_ROTATION)
    page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
-    quality = kwargs.get('quality', QUALITY_DEFAULT)
    transformations = kwargs.get('transformations', [])

    unoconv_output = None

-    output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
+    if output_filepath is None:
+        output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
+        
    if os.path.exists(output_filepath):
        return output_filepath

@@ -125,7 +108,7 @@ def convert(input_filepath, cleanup_files=True, *args, **kwargs):
        )           

    try:
-        backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format)
+        backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format)
    finally:
        if cleanup_files:
            cleanup(input_filepath)
--- a/apps/converter/backends/graphicsmagick/base.py
+++ b/apps/converter/backends/graphicsmagick/base.py
@@ -3,7 +3,6 @@ import re

 from converter.conf.settings import GM_PATH
 from converter.conf.settings import GM_SETTINGS
-from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, \
    IdentifyError
 from converter.backends import ConverterBase
@@ -31,8 +30,10 @@ class ConverterClass(ConverterBase):
            raise IdentifyError(proc.stderr.readline())
        return proc.stdout.read()

-    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        arguments = []
+
+
        if transformations:
            for transformation in transformations:
                if transformation['transformation'] == TRANSFORMATION_RESIZE:
@@ -51,7 +52,7 @@ class ConverterClass(ConverterBase):
                    arguments.append(u'-rotate')
                    arguments.append(u'%s' % transformation['arguments']['degrees'])

-        if format == u'jpeg':
+        if file_format.lower() == u'jpeg' or file_format.lower() == u'jpg':
            arguments.append(u'-quality')
            arguments.append(u'85')

@@ -64,7 +65,6 @@ class ConverterClass(ConverterBase):
        command = []
        command.append(unicode(GM_PATH))
        command.append(u'convert')
-        command.extend(unicode(QUALITY_SETTINGS[quality]).split())
        command.extend(unicode(GM_SETTINGS).split())
        command.append(unicode(input_arg))
        if arguments:
--- a/apps/converter/backends/imagemagick/base.py
+++ b/apps/converter/backends/imagemagick/base.py
@@ -3,7 +3,6 @@ import re

 from converter.conf.settings import IM_IDENTIFY_PATH
 from converter.conf.settings import IM_CONVERT_PATH
-from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
 from converter.exceptions import ConvertError, UnknownFormat, \
    IdentifyError
 from converter.backends import ConverterBase
@@ -30,7 +29,7 @@ class ConverterClass(ConverterBase):
            raise IdentifyError(proc.stderr.readline())
        return proc.stdout.read()

-    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
        arguments = []
        if transformations:
            for transformation in transformations:
@@ -50,7 +49,7 @@ class ConverterClass(ConverterBase):
                    arguments.append(u'-rotate')
                    arguments.append(u'%s' % transformation['arguments']['degrees'])
                    
-        if format == u'jpeg':
+        if file_format.lower() == u'jpeg' or file_format.lower() == u'jpg':
            arguments.append(u'-quality')
            arguments.append(u'85')
        
@@ -62,7 +61,6 @@ class ConverterClass(ConverterBase):
                  
        command = []
        command.append(unicode(IM_CONVERT_PATH))
-        command.extend(unicode(QUALITY_SETTINGS[quality]).split())
        command.append(unicode(input_arg))
        if arguments:
            command.extend(arguments)
--- a/apps/converter/backends/python/base.py
+++ b/apps/converter/backends/python/base.py
@@ -1,16 +1,21 @@
+import tempfile
+import os
+
 import slate
 from PIL import Image
+import ghostscript

 from django.utils.translation import ugettext_lazy as _

-from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
+from common.utils import get_mimetype
+
 from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
 from converter.backends import ConverterBase
 from converter.literals import TRANSFORMATION_RESIZE, \
    TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
-from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
+from converter.literals import DEFAULT_PAGE_NUMBER, \
    DEFAULT_FILE_FORMAT
-from converter.utils import get_mimetype
+from converter.utils import cleanup


 class ConverterClass(ConverterBase):
@@ -42,11 +47,45 @@ class ConverterClass(ConverterBase):
            
        return page_count
    
-    def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+    def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
+        tmpfile = None
+        mimetype, encoding = get_mimetype(input_filepath)
+        if mimetype == 'application/pdf':
+            # If file is a PDF open it with ghostscript and convert it to
+            # TIFF
+            first_page_tmpl = '-dFirstPage=%d' % page
+            last_page_tmpl = '-dLastPage=%d' % page
+            fd, tmpfile = tempfile.mkstemp()
+            os.close(fd)
+            output_file_tmpl = '-sOutputFile=%s' % tmpfile
+            input_file_tmpl = '-f%s' % input_filepath
+            args = [
+                'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
+                '-dNOPAUSE', '-dNOPROMPT', 
+                first_page_tmpl, last_page_tmpl,
+                '-sDEVICE=jpeg', '-dJPEGQ=75',
+                '-r150', output_file_tmpl,
+                input_file_tmpl,
+                '-c "60000000 setvmthreshold"',  # use 30MB
+                '-dNOGC',  # No garbage collection
+                '-dMaxBitmap=500000000',
+                '-dAlignToPixels=0',
+                '-dGridFitTT=0',
+                '-dTextAlphaBits=4',
+                '-dGraphicsAlphaBits=4',                
+            ] 
+
+            ghostscript.Ghostscript(*args)
+            page = 1 # Don't execute the following while loop
+            input_filepath = tmpfile    
+
        try:
            im = Image.open(input_filepath)
        except Exception: # Python Imaging Library doesn't recognize it as an image
            raise UnknownFormat
+        finally:
+            if tmpfile:
+                cleanup(tmpfile)
        
        current_page = 0
        try:
@@ -58,12 +97,12 @@ class ConverterClass(ConverterBase):
            pass # end of sequence        

        if transformations:
+            aspect = 1.0 * im.size[0] / im.size[1]
            for transformation in transformations:
-                aspect = 1.0 * im.size[1] / im.size[0]
                if transformation['transformation'] == TRANSFORMATION_RESIZE:
                    width = int(transformation['arguments']['width'])
                    height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
-                    im = im.resize((width, height), Image.ANTIALIAS)
+                    im = self.resize(im, (width, height))
                elif transformation['transformation'] == TRANSFORMATION_ZOOM:
                    decimal_value = float(transformation['arguments']['percent']) / 100
                    im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1])) 
@@ -73,6 +112,7 @@ class ConverterClass(ConverterBase):

        if im.mode not in ('L', 'RGB'):
            im = im.convert('RGB')
+            
        im.save(output_filepath, format=file_format)

    def get_format_list(self):
@@ -91,3 +131,41 @@ class ConverterClass(ConverterBase):
            TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
            TRANSFORMATION_ZOOM
        ]
+
+    # From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
+    def resize(self, img, box, fit=False, out=None):
+        '''Downsample the image.
+        @param img: Image -  an Image-object
+        @param box: tuple(x, y) - the bounding box of the result image
+        @param fit: boolean - crop the image to fill the box
+        @param out: file-like-object - save the image into the output stream
+        '''
+        #preresize image with factor 2, 4, 8 and fast algorithm
+        factor = 1
+        while img.size[0]/factor > 2*box[0] and img.size[1]*2/factor > 2*box[1]:
+            factor *=2
+        if factor > 1:
+            img.thumbnail((img.size[0]/factor, img.size[1]/factor), Image.NEAREST)
+
+        #calculate the cropping box and get the cropped part
+        if fit:
+            x1 = y1 = 0
+            x2, y2 = img.size
+            wRatio = 1.0 * x2/box[0]
+            hRatio = 1.0 * y2/box[1]
+            if hRatio > wRatio:
+                y1 = y2/2-box[1]*wRatio/2
+                y2 = y2/2+box[1]*wRatio/2
+            else:
+                x1 = x2/2-box[0]*hRatio/2
+                x2 = x2/2+box[0]*hRatio/2
+            img = img.crop((x1,y1,x2,y2))
+
+        #Resize the image with best quality algorithm ANTI-ALIAS
+        img.thumbnail(box, Image.ANTIALIAS)
+
+        if out:
+            #save it into a file-like object
+            img.save(out, "JPEG", quality=75)
+        else:
+            return img
--- a/apps/converter/conf/settings.py
+++ b/apps/converter/conf/settings.py
@@ -14,9 +14,7 @@ register_settings(
        {'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use.  Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
        {'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
        #{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
-        {'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''},
-        {'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''},
-        {'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
-        {'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
+        #{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
+        #{'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
    ]
 )
--- a/apps/converter/exceptions.py
+++ b/apps/converter/exceptions.py
@@ -13,13 +13,6 @@ class UnknownFormat(ConvertError):
    pass


-class UnpaperError(ConvertError):
-    """
-    Raised by unpaper
-    """
-    pass
-
-
 class IdentifyError(ConvertError):
    """
    Raised by identify
--- a/apps/converter/literals.py
+++ b/apps/converter/literals.py
@@ -1,27 +1,10 @@
 from django.utils.translation import ugettext_lazy as _

-from converter.conf.settings import DEFAULT_OPTIONS
-from converter.conf.settings import LOW_QUALITY_OPTIONS
-from converter.conf.settings import HIGH_QUALITY_OPTIONS
-from converter.conf.settings import PRINT_QUALITY_OPTIONS
-
 DEFAULT_ZOOM_LEVEL = 100
 DEFAULT_ROTATION = 0
 DEFAULT_PAGE_NUMBER = 1
 DEFAULT_FILE_FORMAT = u'jpeg'

-QUALITY_DEFAULT = u'quality_default'
-QUALITY_LOW = u'quality_low'
-QUALITY_HIGH = u'quality_high'
-QUALITY_PRINT = u'quality_print'
-
-QUALITY_SETTINGS = {
-    QUALITY_DEFAULT: DEFAULT_OPTIONS,
-    QUALITY_LOW: LOW_QUALITY_OPTIONS,
-    QUALITY_HIGH: HIGH_QUALITY_OPTIONS,
-    QUALITY_PRINT: PRINT_QUALITY_OPTIONS
-}
-
 DIMENSION_SEPARATOR = u'x'

 TRANSFORMATION_RESIZE = u'resize'
--- a/apps/converter/utils.py
+++ b/apps/converter/utils.py
@@ -2,14 +2,6 @@ import os

 from django.core.exceptions import ImproperlyConfigured
 from django.utils.importlib import import_module
-
-try:
-    from python_magic import magic
-    USE_PYTHON_MAGIC = True
-except:
-    import mimetypes
-    mimetypes.init()
-    USE_PYTHON_MAGIC = False
    
    
 #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
@@ -82,30 +74,11 @@ def load_backend():
                raise # If there's some other error, this must be an error in Mayan itself.


-def get_mimetype(filepath):
+def cleanup(filename):
    """
-    Determine a file's mimetype by calling the system's libmagic
-    library via python-magic or fallback to use python's mimetypes
-    library
+    Tries to remove the given filename. Ignores non-existent files
    """
-    file_mimetype = u''
-    file_mime_encoding = u''
-    
-    if USE_PYTHON_MAGIC:
-        if os.path.exists(filepath):
-            try:
-                source = open(filepath, 'r')
-                mime = magic.Magic(mime=True)
-                file_mimetype = mime.from_buffer(source.read())
-                source.seek(0)
-                mime_encoding = magic.Magic(mime_encoding=True)
-                file_mime_encoding = mime_encoding.from_buffer(source.read())
-            finally:
-                if source:
-                    source.close()
-    else:
-        path, filename = os.path.split(filepath)
-        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
-        
-    return file_mimetype, file_mime_encoding
-
+    try:
+        os.remove(filename)
+    except OSError:
+        pass
--- a/apps/documents/init.py
+++ b/apps/documents/init.py
@@ -2,6 +2,7 @@ from django.utils.translation import ugettext_lazy as _
 from django.core.urlresolvers import reverse
 from django.conf import settings

+from common.utils import validate_path
 from navigation.api import register_links, register_top_menu, \
    register_model_list_columns, register_multi_item_links, \
    register_sidebar_template
@@ -24,8 +25,25 @@ from documents.literals import HISTORY_DOCUMENT_CREATED, \
    HISTORY_DOCUMENT_EDITED, HISTORY_DOCUMENT_DELETED
 from documents.conf.settings import ZOOM_MAX_LEVEL
 from documents.conf.settings import ZOOM_MIN_LEVEL
+from documents.conf.settings import CACHE_PATH
 from documents.widgets import document_thumbnail

+# Document page links expressions
+def is_first_page(context):
+    return context['object'].page_number <= 1
+
+
+def is_last_page(context):
+    return context['object'].page_number >= context['object'].document.documentpage_set.count()
+    
+
+def is_min_zoom(context):
+    return context['zoom'] <= ZOOM_MIN_LEVEL
+
+
+def is_max_zoom(context):
+    return context['zoom'] >= ZOOM_MAX_LEVEL
+
 # Permission setup
 set_namespace_title('documents', _(u'Documents'))
 register_permission(PERMISSION_DOCUMENT_CREATE)
@@ -48,23 +66,6 @@ register_history_type(HISTORY_DOCUMENT_CREATED)
 register_history_type(HISTORY_DOCUMENT_EDITED)
 register_history_type(HISTORY_DOCUMENT_DELETED)

-
-# Document page links expressions
-def is_first_page(context):
-    return context['object'].page_number <= 1
-
-
-def is_last_page(context):
-    return context['object'].page_number >= context['object'].document.documentpage_set.count()
-    
-
-def is_min_zoom(context):
-    return context['zoom'] <= ZOOM_MIN_LEVEL
-
-
-def is_max_zoom(context):
-    return context['zoom'] >= ZOOM_MAX_LEVEL
-
 document_list = {'text': _(u'all documents'), 'view': 'document_list', 'famfam': 'page', 'permissions': [PERMISSION_DOCUMENT_VIEW]}
 document_list_recent = {'text': _(u'recent documents'), 'view': 'document_list_recent', 'famfam': 'page', 'permissions': [PERMISSION_DOCUMENT_VIEW]}
 document_create_multiple = {'text': _(u'upload new documents'), 'view': 'document_create_multiple', 'famfam': 'page_add', 'permissions': [PERMISSION_DOCUMENT_CREATE]}
@@ -198,3 +199,5 @@ register_sidebar_template(['document_type_list'], 'document_types_help.html')
 register_links(Document, [document_view_simple], menu_name='form_header', position=0)
 register_links(Document, [document_view_advanced], menu_name='form_header', position=1)
 register_links(Document, [document_history_view], menu_name='form_header')
+
+validate_path(CACHE_PATH)
--- a/apps/documents/conf/settings.py
+++ b/apps/documents/conf/settings.py
@@ -2,8 +2,10 @@

 import hashlib
 import uuid
+import os

 from django.utils.translation import ugettext_lazy as _
+from django.conf import settings

 from storage.backends.filebasedstorage import FileBasedStorage
 from smart_settings.api import register_settings
@@ -38,5 +40,7 @@ register_settings(
        {'name': u'ZOOM_MAX_LEVEL', 'global_name': u'DOCUMENTS_ZOOM_MAX_LEVEL', 'default': 200, 'description': _(u'Maximum amount in percent (%) to allow user to zoom in a document page interactively.')},
        {'name': u'ZOOM_MIN_LEVEL', 'global_name': u'DOCUMENTS_ZOOM_MIN_LEVEL', 'default': 50, 'description': _(u'Minimum amount in percent (%) to allow user to zoom out a document page interactively.')},
        {'name': u'ROTATION_STEP', 'global_name': u'DOCUMENTS_ROTATION_STEP', 'default': 90, 'description': _(u'Amount in degrees to rotate a document page per user interaction.')},
+        #
+        {'name': u'CACHE_PATH', 'global_name': u'DOCUMENTS_CACHE_PATH', 'default': os.path.join(settings.PROJECT_ROOT, 'image_cache'), 'exists': True},
    ]
 )
--- a/apps/documents/models.py
+++ b/apps/documents/models.py
@@ -1,11 +1,13 @@
 import os
 import tempfile
+import hashlib

 from django.db import models
 from django.utils.translation import ugettext_lazy as _
 from django.contrib.auth.models import User
 from django.contrib.contenttypes import generic
 from django.contrib.comments.models import Comment
+from django.conf import settings

 from python_magic import magic

@@ -13,12 +15,26 @@ from taggit.managers import TaggableManager
 from dynamic_search.api import register
 from converter.api import get_page_count
 from converter.api import get_available_transformations_choices
+from converter.api import create_image_cache_filename, convert
+from converter.exceptions import UnknownFormat, UnkownConvertError

 from documents.conf.settings import CHECKSUM_FUNCTION
 from documents.conf.settings import UUID_FUNCTION
 from documents.conf.settings import STORAGE_BACKEND
+from documents.conf.settings import PREVIEW_SIZE
+from documents.conf.settings import THUMBNAIL_SIZE
+from documents.conf.settings import CACHE_PATH
+
 from documents.managers import RecentDocumentManager, \
    DocumentPageTransformationManager
+from documents.utils import document_save_to_temp_dir
+from documents.literals import PICTURE_ERROR_SMALL, PICTURE_ERROR_MEDIUM, \
+    PICTURE_UNKNOWN_SMALL, PICTURE_UNKNOWN_MEDIUM
+from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
+    DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER
+    
+# document image cache name hash function
+HASH_FUNCTION = lambda x: hashlib.sha256(x).hexdigest()


 def get_filename_from_uuid(instance, filename):
@@ -201,8 +217,7 @@ class Document(models.Model):
        exists in storage
        """
        return self.file.storage.exists(self.file.path)
-    
-
+   
    def apply_default_transformations(self, transformations):
        #Only apply default transformations on new documents
        if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0:
@@ -216,6 +231,29 @@ class Document(models.Model):
                    )

                    page_transformation.save()
+                    
+    def get_image_cache_name(self, page):
+        document_page = self.documentpage_set.get(page_number=page)
+        transformations, warnings = document_page.get_transformation_list()
+        hash_value = HASH_FUNCTION(u''.join([self.checksum, unicode(page), unicode(transformations)]))
+        cache_file_path = os.path.join(CACHE_PATH, hash_value)
+        if os.path.exists(cache_file_path):
+            return cache_file_path
+        else:
+            document_file = document_save_to_temp_dir(self, self.checksum)
+            return convert(document_file, output_filepath=cache_file_path, page=page, transformations=transformations)
+            
+    def get_image(self, size=PREVIEW_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
+        try:
+            image_cache_name = self.get_image_cache_name(page=page)
+            output_file = convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation)
+        except UnknownFormat:
+            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_SMALL)
+        except UnkownConvertError:    
+            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL)
+        except Exception, e:
+            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL)
+        return output_file


 class DocumentTypeFilename(models.Model):
--- a/apps/documents/templatetags/printing_tags.py
+++ b/apps/documents/templatetags/printing_tags.py
@@ -1,6 +1,6 @@
 from django.template import Library, Node, Variable

-from converter.api import get_document_dimensions, QUALITY_PRINT
+from converter.api import get_document_dimensions

 from documents.views import calculate_converter_arguments
 from documents.conf.settings import PRINT_SIZE
@@ -14,8 +14,7 @@ class GetImageSizeNode(Node):

    def render(self, context):
        document = Variable(self.document).resolve(context)
-        arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, quality=QUALITY_PRINT)
-        width, height = get_document_dimensions(document, **arguments)
+        width, height = get_document_dimensions(document)
        context[u'document_width'], context['document_height'] = width, height
        context[u'document_aspect'] = float(width) / float(height)
        return u''
--- a/apps/documents/urls.py
+++ b/apps/documents/urls.py
@@ -1,7 +1,5 @@
 from django.conf.urls.defaults import patterns, url

-from converter.literals import QUALITY_HIGH, QUALITY_PRINT
-
 from documents.conf.settings import PREVIEW_SIZE
 from documents.conf.settings import PRINT_SIZE
 from documents.conf.settings import THUMBNAIL_SIZE
@@ -24,8 +22,8 @@ urlpatterns = patterns('documents.views',
    url(r'^(?P<document_id>\d+)/display/preview/$', 'get_document_image', {'size': PREVIEW_SIZE}, 'document_preview'),
    url(r'^(?P<document_id>\d+)/display/preview/multipage/$', 'get_document_image', {'size': MULTIPAGE_PREVIEW_SIZE}, 'document_preview_multipage'),
    url(r'^(?P<document_id>\d+)/display/thumbnail/$', 'get_document_image', {'size': THUMBNAIL_SIZE}, 'document_thumbnail'),
-    url(r'^(?P<document_id>\d+)/display/$', 'get_document_image', {'size': DISPLAY_SIZE, 'quality': QUALITY_HIGH}, 'document_display'),
-    url(r'^(?P<document_id>\d+)/display/print/$', 'get_document_image', {'size': PRINT_SIZE, 'quality': QUALITY_PRINT}, 'document_display_print'),
+    url(r'^(?P<document_id>\d+)/display/$', 'get_document_image', {'size': DISPLAY_SIZE}, 'document_display'),
+    url(r'^(?P<document_id>\d+)/display/print/$', 'get_document_image', {'size': PRINT_SIZE}, 'document_display_print'),

    url(r'^(?P<document_id>\d+)/download/$', 'document_download', (), 'document_download'),
    url(r'^(?P<document_id>\d+)/create/siblings/$', 'document_create_siblings', (), 'document_create_siblings'),
--- a/apps/documents/utils.py
+++ b/apps/documents/utils.py
@@ -1,6 +1,6 @@
 import os

-from common import TEMPORARY_DIRECTORY
+from common.conf.settings import TEMPORARY_DIRECTORY


 #http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
--- a/apps/documents/views.py
+++ b/apps/documents/views.py
@@ -20,11 +20,8 @@ from common.widgets import two_state_template
 from common.literals import PAGE_SIZE_DIMENSIONS, \
    PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE
 from common.conf.settings import DEFAULT_PAPER_SIZE
-from converter.api import convert_document
-from converter.exceptions import UnkownConvertError, UnknownFormat
 from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
-    DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \
-    DEFAULT_PAGE_NUMBER
+    DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER
 from filetransfers.api import serve_file
 from grouping.utils import get_document_group_subtemplate
 from metadata.api import save_metadata_list, \
@@ -287,7 +284,7 @@ def document_edit(request, document_id):
    }, context_instance=RequestContext(request))


-def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
+def get_document_image(request, document_id, size=PREVIEW_SIZE):
    check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)
@@ -304,36 +301,7 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_

    rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360

-    document_page = get_object_or_404(document.documentpage_set, page_number=page)
-    transformations, warnings = document_page.get_transformation_list()
-
-    if warnings and (request.user.is_staff or request.user.is_superuser):
-        for warning in warnings:
-            messages.warning(request, _(u'Page transformation error: %s') % warning)
-            
-    try:
-        output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations)
-    except UnkownConvertError, e:
-        if request.user.is_staff or request.user.is_superuser:
-            messages.error(request, e)
-        if size == THUMBNAIL_SIZE:
-            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL)
-        else:
-            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_MEDIUM)
-    except UnknownFormat:
-        if size == THUMBNAIL_SIZE:
-            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_SMALL)
-        else:
-            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_MEDIUM)
-    except Exception, e:
-        if request.user.is_staff or request.user.is_superuser:
-            messages.error(request, e)
-        if size == THUMBNAIL_SIZE:
-            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL)
-        else:
-            output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_MEDIUM)
-    finally:
-        return sendfile.sendfile(request, output_file)
+    return sendfile.sendfile(request, document.get_image(size=size, page=page, zoom=zoom, rotation=rotation))


 def document_download(request, document_id):
@@ -804,13 +772,14 @@ def document_print(request, document_id):


 def document_hard_copy(request, document_id):
+    #TODO: FIXME - this view still calls convert_document(), whose import from converter.api is removed above
    check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])

    document = get_object_or_404(Document, pk=document_id)

    RecentDocument.objects.add_document_for_user(request.user, document)

-    arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, file_format=DEFAULT_FILE_FORMAT, quality=QUALITY_PRINT)
+    arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, file_format=DEFAULT_FILE_FORMAT)

    # Pre-generate
    convert_document(document, **arguments)
--- a/apps/ocr/api.py
+++ b/apps/ocr/api.py
@@ -9,18 +9,18 @@ import sys
 from django.utils.translation import ugettext as _
 from django.utils.importlib import import_module

-from common import TEMPORARY_DIRECTORY
+from common.conf.settings import TEMPORARY_DIRECTORY
 from converter.api import convert
 from documents.models import DocumentPage
-from documents.utils import document_save_to_temp_dir

 from ocr.conf.settings import TESSERACT_PATH
 from ocr.conf.settings import TESSERACT_LANGUAGE
-from ocr.exceptions import TesseractError
+from ocr.exceptions import TesseractError, UnpaperError
 from ocr.conf.settings import UNPAPER_PATH
 from ocr.parsers import parse_document_page
 from ocr.parsers.exceptions import ParserError, ParserUnknownFile
-from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT
+from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT, \
+    DEFAULT_OCR_FILE_EXTENSION


 def get_language_backend():
@@ -56,8 +56,10 @@ def run_tesseract(input_filename, lang=None):
    os.close(fd)
    ocr_output = os.extsep.join([filepath, u'txt'])
    command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
-    if lang is not None:
-        command += [u'-l', lang]
+    
+    # TODO: language option disabled (lang is ignored): Tesseract 3.0 segfaults, presumably when given -l
+    #if lang is not None:
+    #    command.extend([u'-l', lang])

    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
    return_code = proc.wait()
@@ -67,7 +69,13 @@ def run_tesseract(input_filename, lang=None):
        cleanup(ocr_output)
        raise TesseractError(error_text)
        
-    return codecs.open(ocr_output, 'r', 'utf-8'), ocr_output
+    fd = codecs.open(ocr_output, 'r', 'utf-8')
+    text = fd.read().strip()
+    fd.close()
+    
+    os.unlink(filepath)    
+    
+    return text
    

 def do_document_ocr(queue_document):
@@ -82,36 +90,37 @@ def do_document_ocr(queue_document):
            parse_document_page(document_page)
        except (ParserError, ParserUnknownFile):
            # Fall back to doing visual OCR
-            transformations = []
-            document_transformations, warnings = document_page.get_transformation_list()
-            ocr_transformations, warnings = queue_document.get_transformation_list()
-            transformations.extend(document_transformations)
-            transformations.extend(ocr_transformations)
-
-            unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
-
-            document_filepath = os.path.join(TEMPORARY_DIRECTORY, document_page.document.uuid)
-            unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
+            ##ocr_transformations, warnings = queue_document.get_transformation_list()
            
-            document.save_to_file(document_filepath)
+            document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number)
+            unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
+            unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
+
+            unpaper_input=convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
+            execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)
+
+            #from PIL import Image, ImageOps
+            #im = Image.open(document_filepath)
+            ##if im.mode=='RGBA':
+            ##    im=im.convert('RGB')
+            ##im = im.convert('L')
+            #im = ImageOps.grayscale(im)
+            #im.save(unpaper_output_filepath)

-            transformed_filepath=convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, page=document_page.page_number, transformations=transformations)
-            execute_unpaper(input_filepath=transformed_filepath, output_filepath=unpaper_output_filepath)
            # Convert to TIFF
            pre_ocr_filepath = output_filepath=convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
            # Tesseract needs an explicit file extension
-            pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_FORMAT])
+            pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
            os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
            try:
-                fd, ocr_output = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
-                document_page.content = ocr_cleanup(fd.read().strip())
+                ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
+
+                document_page.content = ocr_cleanup(ocr_text)
                document_page.page_label = _(u'Text from OCR')
                document_page.save()
-                fd.close()
-                cleanup(ocr_output)
            finally:
                cleanup(pre_ocr_filepath_w_ext)
-                cleanup(transformed_filepath)
+                cleanup(unpaper_input)
                cleanup(document_filepath)
                cleanup(unpaper_output_filepath)

@@ -155,6 +164,7 @@ def execute_unpaper(input_filepath, output_filepath):
    command = []
    command.append(UNPAPER_PATH)
    command.append(u'--overwrite')
+    command.append(u'--no-multi-pages')
    command.append(input_filepath)
    command.append(output_filepath)
    proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)
--- a/apps/ocr/exceptions.py
+++ b/apps/ocr/exceptions.py
@@ -4,3 +4,10 @@ class AlreadyQueued(Exception):

 class TesseractError(Exception):
    pass
+
+
+class UnpaperError(Exception):
+    """
+    Signals a failure of the external unpaper command (raise site not shown here - TODO confirm)
+    """
+    pass
--- a/apps/ocr/literals.py
+++ b/apps/ocr/literals.py
@@ -20,5 +20,6 @@ QUEUEDOCUMENT_STATE_CHOICES = (
    (QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
 )

-DEFAULT_OCR_FILE_FORMAT = u'tif'
-UNPAPER_FILE_FORMAT = u'pnm'
+DEFAULT_OCR_FILE_FORMAT = u'tiff'
+DEFAULT_OCR_FILE_EXTENSION = u'tif'
+UNPAPER_FILE_FORMAT = u'ppm'