Merge branch 'document_image_caching' into smart_staging

This commit is contained in:
Roberto Rosario
2011-07-21 03:50:19 -04:00
20 changed files with 289 additions and 195 deletions

View File

@@ -2,11 +2,10 @@ import tempfile
from django.utils.translation import ugettext_lazy as _
from common.conf import settings as common_settings
from navigation.api import register_links
TEMPORARY_DIRECTORY = common_settings.TEMPORARY_DIRECTORY \
if common_settings.TEMPORARY_DIRECTORY else tempfile.mkdtemp()
from common.conf import settings as common_settings
from common.utils import validate_path
def has_usable_password(context):
@@ -17,3 +16,6 @@ current_user_details = {'text': _(u'user details'), 'view': 'current_user_detail
current_user_edit = {'text': _(u'edit details'), 'view': 'current_user_edit', 'famfam': 'vcard_edit'}
register_links(['current_user_details', 'current_user_edit', 'password_change_view'], [current_user_details, current_user_edit, password_change_view], menu_name='secondary_menu')
# Fall back to a freshly created private temporary directory when the
# configured one is unset or unusable.  The "is it set?" test runs first so
# that validate_path() is never handed None or an empty string.
if not common_settings.TEMPORARY_DIRECTORY or not validate_path(common_settings.TEMPORARY_DIRECTORY):
    common_settings.TEMPORARY_DIRECTORY = tempfile.mkdtemp()

View File

@@ -2,6 +2,7 @@
import os
import re
import types
import tempfile
from django.utils.http import urlquote as django_urlquote
from django.utils.http import urlencode as django_urlencode
@@ -12,6 +13,15 @@ from django.contrib.contenttypes.models import ContentType
from django.contrib.auth.models import User
try:
from python_magic import magic
USE_PYTHON_MAGIC = True
except:
import mimetypes
mimetypes.init()
USE_PYTHON_MAGIC = False
def urlquote(link=None, get=None):
u'''
This method does both: urlquote() and urlencode()
@@ -337,3 +347,50 @@ def return_diff(old_obj, new_obj, attrib_list=None):
}
return diff_dict
def get_mimetype(filepath):
    """
    Determine a file's mimetype by calling the system's libmagic
    library via python-magic or fallback to use python's mimetypes
    library

    Returns a (mimetype, encoding) tuple.  With python-magic both values
    stay empty unicode strings when filepath does not exist; with the
    mimetypes fallback only the file name is inspected and either value
    may be None when it cannot be guessed.
    """
    file_mimetype = u''
    file_mime_encoding = u''

    if USE_PYTHON_MAGIC:
        if os.path.exists(filepath):
            # Binary mode: libmagic must see the raw bytes, text mode can
            # alter them on some platforms.  The file is opened outside
            # the try block so a failing open() surfaces its own error
            # instead of tripping over an unbound name during cleanup.
            source = open(filepath, 'rb')
            try:
                # Read once and reuse the buffer for both detections
                data = source.read()
            finally:
                source.close()

            file_mimetype = magic.Magic(mime=True).from_buffer(data)
            file_mime_encoding = magic.Magic(mime_encoding=True).from_buffer(data)
    else:
        filename = os.path.basename(filepath)
        file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)

    return file_mimetype, file_mime_encoding
def validate_path(path):
    """
    Check that path is a usable, writable directory.

    If path does not exist an attempt is made to create it (a single
    level, via os.mkdir).  Writability is then verified by creating and
    removing a scratch file inside it.

    Returns True when the directory exists (or was created) and is
    writable, False otherwise, including when path is None or empty.
    """
    # An unset setting can never be a usable directory
    if not path:
        return False

    if not os.path.exists(path):
        # If it doesn't exist try to create it
        try:
            os.mkdir(path)
        except (OSError, IOError, ValueError, TypeError):
            return False

    # Check if it is writable
    try:
        fd, test_filepath = tempfile.mkstemp(dir=path)
        os.close(fd)
        os.unlink(test_filepath)
    except (OSError, IOError, ValueError, TypeError):
        return False

    return True

View File

@@ -2,14 +2,12 @@ import os
import subprocess
import hashlib
from common import TEMPORARY_DIRECTORY
from documents.utils import document_save_to_temp_dir
from common.conf.settings import TEMPORARY_DIRECTORY
from converter.conf.settings import UNOCONV_PATH
from converter.exceptions import UnpaperError, OfficeConversionError
from converter.exceptions import OfficeConversionError
from converter.literals import DEFAULT_PAGE_NUMBER, \
QUALITY_DEFAULT, DEFAULT_ZOOM_LEVEL, \
DEFAULT_ROTATION, DEFAULT_FILE_FORMAT, QUALITY_HIGH
DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, DEFAULT_FILE_FORMAT
from converter import backend
from converter.literals import TRANSFORMATION_CHOICES
@@ -17,6 +15,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR
from converter.utils import cleanup
def HASH_FUNCTION(x):
    """Return the hexadecimal SHA-256 digest of x."""
    return hashlib.sha256(x).hexdigest()
@@ -24,15 +23,6 @@ CONVERTER_OFFICE_FILE_EXTENSIONS = [
u'ods', u'docx', u'doc'
]
def cleanup(filename):
"""
Tries to remove the given filename. Ignores non-existent files
"""
try:
os.remove(filename)
except OSError:
pass
def execute_unoconv(input_filepath, arguments=''):
"""
@@ -70,26 +60,19 @@ def convert_office_document(input_filepath):
return None
def convert_document(document, *args, **kwargs):
document_filepath = create_image_cache_filename(document.checksum, *args, **kwargs)
if os.path.exists(document_filepath):
return document_filepath
return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs)
def convert(input_filepath, cleanup_files=True, *args, **kwargs):
def convert(input_filepath, output_filepath=None, cleanup_files=False, *args, **kwargs):
size = kwargs.get('size')
file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
rotation = kwargs.get('rotation', DEFAULT_ROTATION)
page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
quality = kwargs.get('quality', QUALITY_DEFAULT)
transformations = kwargs.get('transformations', [])
unoconv_output = None
output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
if output_filepath is None:
output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
if os.path.exists(output_filepath):
return output_filepath
@@ -125,7 +108,7 @@ def convert(input_filepath, cleanup_files=True, *args, **kwargs):
)
try:
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format)
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, transformations=transformations, page=page, file_format=file_format)
finally:
if cleanup_files:
cleanup(input_filepath)

View File

@@ -3,7 +3,6 @@ import re
from converter.conf.settings import GM_PATH
from converter.conf.settings import GM_SETTINGS
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError
from converter.backends import ConverterBase
@@ -31,8 +30,10 @@ class ConverterClass(ConverterBase):
raise IdentifyError(proc.stderr.readline())
return proc.stdout.read()
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
arguments = []
if transformations:
for transformation in transformations:
if transformation['transformation'] == TRANSFORMATION_RESIZE:
@@ -51,7 +52,7 @@ class ConverterClass(ConverterBase):
arguments.append(u'-rotate')
arguments.append(u'%s' % transformation['arguments']['degrees'])
if format == u'jpeg':
if file_format.lower() == u'jpeg' or file_format.lower() == u'jpg':
arguments.append(u'-quality')
arguments.append(u'85')
@@ -64,7 +65,6 @@ class ConverterClass(ConverterBase):
command = []
command.append(unicode(GM_PATH))
command.append(u'convert')
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.extend(unicode(GM_SETTINGS).split())
command.append(unicode(input_arg))
if arguments:

View File

@@ -3,7 +3,6 @@ import re
from converter.conf.settings import IM_IDENTIFY_PATH
from converter.conf.settings import IM_CONVERT_PATH
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from converter.exceptions import ConvertError, UnknownFormat, \
IdentifyError
from converter.backends import ConverterBase
@@ -30,7 +29,7 @@ class ConverterClass(ConverterBase):
raise IdentifyError(proc.stderr.readline())
return proc.stdout.read()
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
arguments = []
if transformations:
for transformation in transformations:
@@ -50,7 +49,7 @@ class ConverterClass(ConverterBase):
arguments.append(u'-rotate')
arguments.append(u'%s' % transformation['arguments']['degrees'])
if format == u'jpeg':
if file_format.lower() == u'jpeg' or file_format.lower() == u'jpg':
arguments.append(u'-quality')
arguments.append(u'85')
@@ -62,7 +61,6 @@ class ConverterClass(ConverterBase):
command = []
command.append(unicode(IM_CONVERT_PATH))
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.append(unicode(input_arg))
if arguments:
command.extend(arguments)

View File

@@ -1,16 +1,21 @@
import tempfile
import os
import slate
from PIL import Image
import ghostscript
from django.utils.translation import ugettext_lazy as _
from converter.literals import QUALITY_DEFAULT, QUALITY_SETTINGS
from common.utils import get_mimetype
from converter.exceptions import ConvertError, UnknownFormat, IdentifyError
from converter.backends import ConverterBase
from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
from converter.literals import QUALITY_DEFAULT, DEFAULT_PAGE_NUMBER, \
from converter.literals import DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
from converter.utils import get_mimetype
from converter.utils import cleanup
class ConverterClass(ConverterBase):
@@ -42,11 +47,45 @@ class ConverterClass(ConverterBase):
return page_count
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
def convert_file(self, input_filepath, output_filepath, transformations=None, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
tmpfile = None
mimetype, encoding = get_mimetype(input_filepath)
if mimetype == 'application/pdf':
# If file is a PDF open it with ghostscript and convert it to
# TIFF
first_page_tmpl = '-dFirstPage=%d' % page
last_page_tmpl = '-dLastPage=%d' % page
fd, tmpfile = tempfile.mkstemp()
os.close(fd)
output_file_tmpl = '-sOutputFile=%s' % tmpfile
input_file_tmpl = '-f%s' % input_filepath
args = [
'gs', '-q', '-dQUIET', '-dSAFER', '-dBATCH',
'-dNOPAUSE', '-dNOPROMPT',
first_page_tmpl, last_page_tmpl,
'-sDEVICE=jpeg', '-dJPEGQ=75',
'-r150', output_file_tmpl,
input_file_tmpl,
'-c "60000000 setvmthreshold"', # use 30MB
'-dNOGC', # No garbage collection
'-dMaxBitmap=500000000',
'-dAlignToPixels=0',
'-dGridFitTT=0',
'-dTextAlphaBits=4',
'-dGraphicsAlphaBits=4',
]
ghostscript.Ghostscript(*args)
page = 1 # Don't execute the following while loop
input_filepath = tmpfile
try:
im = Image.open(input_filepath)
except Exception: # Python Imaging Library doesn't recognize it as an image
raise UnknownFormat
finally:
if tmpfile:
cleanup(tmpfile)
current_page = 0
try:
@@ -58,12 +97,12 @@ class ConverterClass(ConverterBase):
pass # end of sequence
if transformations:
aspect = 1.0 * im.size[0] / im.size[1]
for transformation in transformations:
aspect = 1.0 * im.size[1] / im.size[0]
if transformation['transformation'] == TRANSFORMATION_RESIZE:
width = int(transformation['arguments']['width'])
height = int(transformation['arguments'].get('height', 1.0 * width * aspect))
im = im.resize((width, height), Image.ANTIALIAS)
im = self.resize(im, (width, height))
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
decimal_value = float(transformation['arguments']['percent']) / 100
im = im.transform((im.size[0] * decimal_value, im.size[1] * decimal_value), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
@@ -73,6 +112,7 @@ class ConverterClass(ConverterBase):
if im.mode not in ('L', 'RGB'):
im = im.convert('RGB')
im.save(output_filepath, format=file_format)
def get_format_list(self):
@@ -91,3 +131,41 @@ class ConverterClass(ConverterBase):
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, \
TRANSFORMATION_ZOOM
]
# From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
def resize(self, img, box, fit=False, out=None):
    '''Shrink an image so that it fits inside a bounding box.

    @param img: Image - the image to shrink; thumbnail() is called on it directly
    @param box: tuple(x, y) - the bounding box of the result image
    @param fit: boolean - crop the image to the proportions of the box first
    @param out: file-like-object - when given, the result is saved into it
                as JPEG (quality 75) and nothing is returned
    @return: the resulting Image when out is not given
    '''
    # Cheap first pass: shrink by a power of two with the fast NEAREST
    # filter while the image is still much larger than the box
    shrink = 1
    while img.size[0]/shrink > 2*box[0] and img.size[1]*2/shrink > 2*box[1]:
        shrink *= 2
    if shrink > 1:
        img.thumbnail((img.size[0]/shrink, img.size[1]/shrink), Image.NEAREST)

    if fit:
        # Keep only the centred region that has the proportions of the box
        width, height = img.size
        left, upper, right, lower = 0, 0, width, height
        width_ratio = 1.0 * width/box[0]
        height_ratio = 1.0 * height/box[1]
        if height_ratio > width_ratio:
            half_span = box[1]*width_ratio/2
            upper = height/2 - half_span
            lower = height/2 + half_span
        else:
            half_span = box[0]*height_ratio/2
            left = width/2 - half_span
            right = width/2 + half_span
        img = img.crop((left, upper, right, lower))

    # Final pass with the high quality ANTIALIAS filter
    img.thumbnail(box, Image.ANTIALIAS)

    if not out:
        return img
    # Save it into the file-like object
    img.save(out, "JPEG", quality=75)

View File

@@ -14,9 +14,7 @@ register_settings(
{'name': u'GRAPHICS_BACKEND', 'global_name': u'CONVERTER_GRAPHICS_BACKEND', 'default': u'converter.backends.python', 'description': _(u'Graphics conversion backend to use. Options are: converter.backends.imagemagick, converter.backends.graphicsmagick and converter.backends.python.')},
{'name': u'UNOCONV_PATH', 'global_name': u'CONVERTER_UNOCONV_PATH', 'default': u'/usr/bin/unoconv', 'exists': True},
#{'name': u'OCR_OPTIONS', 'global_name': u'CONVERTER_OCR_OPTIONS', 'default': u'-colorspace Gray -depth 8 -resample 200x200'},
{'name': u'DEFAULT_OPTIONS', 'global_name': u'CONVERTER_DEFAULT_OPTIONS', 'default': u''},
{'name': u'LOW_QUALITY_OPTIONS', 'global_name': u'CONVERTER_LOW_QUALITY_OPTIONS', 'default': u''},
{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
{'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
#{'name': u'HIGH_QUALITY_OPTIONS', 'global_name': u'CONVERTER_HIGH_QUALITY_OPTIONS', 'default': u'-density 400'},
#{'name': u'PRINT_QUALITY_OPTIONS', 'global_name': u'CONVERTER_PRINT_QUALITY_OPTIONS', 'default': u'-density 500'},
]
)

View File

@@ -13,13 +13,6 @@ class UnknownFormat(ConvertError):
pass
class UnpaperError(ConvertError):
"""
Raised by unpaper
"""
pass
class IdentifyError(ConvertError):
"""
Raised by identify

View File

@@ -1,27 +1,10 @@
from django.utils.translation import ugettext_lazy as _
from converter.conf.settings import DEFAULT_OPTIONS
from converter.conf.settings import LOW_QUALITY_OPTIONS
from converter.conf.settings import HIGH_QUALITY_OPTIONS
from converter.conf.settings import PRINT_QUALITY_OPTIONS
DEFAULT_ZOOM_LEVEL = 100
DEFAULT_ROTATION = 0
DEFAULT_PAGE_NUMBER = 1
DEFAULT_FILE_FORMAT = u'jpeg'
QUALITY_DEFAULT = u'quality_default'
QUALITY_LOW = u'quality_low'
QUALITY_HIGH = u'quality_high'
QUALITY_PRINT = u'quality_print'
QUALITY_SETTINGS = {
QUALITY_DEFAULT: DEFAULT_OPTIONS,
QUALITY_LOW: LOW_QUALITY_OPTIONS,
QUALITY_HIGH: HIGH_QUALITY_OPTIONS,
QUALITY_PRINT: PRINT_QUALITY_OPTIONS
}
DIMENSION_SEPARATOR = u'x'
TRANSFORMATION_RESIZE = u'resize'

View File

@@ -2,14 +2,6 @@ import os
from django.core.exceptions import ImproperlyConfigured
from django.utils.importlib import import_module
try:
from python_magic import magic
USE_PYTHON_MAGIC = True
except:
import mimetypes
mimetypes.init()
USE_PYTHON_MAGIC = False
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python
@@ -82,30 +74,11 @@ def load_backend():
raise # If there's some other error, this must be an error in Mayan itself.
def get_mimetype(filepath):
def cleanup(filename):
"""
Determine a file's mimetype by calling the system's libmagic
library via python-magic or fallback to use python's mimetypes
library
Tries to remove the given filename. Ignores non-existent files
"""
file_mimetype = u''
file_mime_encoding = u''
if USE_PYTHON_MAGIC:
if os.path.exists(filepath):
try:
source = open(filepath, 'r')
mime = magic.Magic(mime=True)
file_mimetype = mime.from_buffer(source.read())
source.seek(0)
mime_encoding = magic.Magic(mime_encoding=True)
file_mime_encoding = mime_encoding.from_buffer(source.read())
finally:
if source:
source.close()
else:
path, filename = os.path.split(filepath)
file_mimetype, file_mime_encoding = mimetypes.guess_type(filename)
return file_mimetype, file_mime_encoding
try:
os.remove(filename)
except OSError:
pass

View File

@@ -2,6 +2,7 @@ from django.utils.translation import ugettext_lazy as _
from django.core.urlresolvers import reverse
from django.conf import settings
from common.utils import validate_path
from navigation.api import register_links, register_top_menu, \
register_model_list_columns, register_multi_item_links, \
register_sidebar_template
@@ -24,8 +25,25 @@ from documents.literals import HISTORY_DOCUMENT_CREATED, \
HISTORY_DOCUMENT_EDITED, HISTORY_DOCUMENT_DELETED
from documents.conf.settings import ZOOM_MAX_LEVEL
from documents.conf.settings import ZOOM_MIN_LEVEL
from documents.conf.settings import CACHE_PATH
from documents.widgets import document_thumbnail
# Document page links expressions
def is_first_page(context):
    """Tell whether the page held in context['object'] has page number 1 or lower."""
    page = context['object']
    return page.page_number <= 1
def is_last_page(context):
    """Tell whether the page held in context['object'] is at or past its document's page count."""
    page = context['object']
    current_number = page.page_number
    total_pages = page.document.documentpage_set.count()
    return current_number >= total_pages
def is_min_zoom(context):
    """Tell whether context['zoom'] is at or below ZOOM_MIN_LEVEL."""
    current_zoom = context['zoom']
    return current_zoom <= ZOOM_MIN_LEVEL
def is_max_zoom(context):
    """Tell whether context['zoom'] is at or above ZOOM_MAX_LEVEL."""
    current_zoom = context['zoom']
    return current_zoom >= ZOOM_MAX_LEVEL
# Permission setup
set_namespace_title('documents', _(u'Documents'))
register_permission(PERMISSION_DOCUMENT_CREATE)
@@ -48,23 +66,6 @@ register_history_type(HISTORY_DOCUMENT_CREATED)
register_history_type(HISTORY_DOCUMENT_EDITED)
register_history_type(HISTORY_DOCUMENT_DELETED)
# Document page links expressions
def is_first_page(context):
return context['object'].page_number <= 1
def is_last_page(context):
return context['object'].page_number >= context['object'].document.documentpage_set.count()
def is_min_zoom(context):
return context['zoom'] <= ZOOM_MIN_LEVEL
def is_max_zoom(context):
return context['zoom'] >= ZOOM_MAX_LEVEL
document_list = {'text': _(u'all documents'), 'view': 'document_list', 'famfam': 'page', 'permissions': [PERMISSION_DOCUMENT_VIEW]}
document_list_recent = {'text': _(u'recent documents'), 'view': 'document_list_recent', 'famfam': 'page', 'permissions': [PERMISSION_DOCUMENT_VIEW]}
document_create_multiple = {'text': _(u'upload new documents'), 'view': 'document_create_multiple', 'famfam': 'page_add', 'permissions': [PERMISSION_DOCUMENT_CREATE]}
@@ -198,3 +199,5 @@ register_sidebar_template(['document_type_list'], 'document_types_help.html')
register_links(Document, [document_view_simple], menu_name='form_header', position=0)
register_links(Document, [document_view_advanced], menu_name='form_header', position=1)
register_links(Document, [document_history_view], menu_name='form_header')
validate_path(CACHE_PATH)

View File

@@ -2,8 +2,10 @@
import hashlib
import uuid
import os
from django.utils.translation import ugettext_lazy as _
from django.conf import settings
from storage.backends.filebasedstorage import FileBasedStorage
from smart_settings.api import register_settings
@@ -38,5 +40,7 @@ register_settings(
{'name': u'ZOOM_MAX_LEVEL', 'global_name': u'DOCUMENTS_ZOOM_MAX_LEVEL', 'default': 200, 'description': _(u'Maximum amount in percent (%) to allow user to zoom in a document page interactively.')},
{'name': u'ZOOM_MIN_LEVEL', 'global_name': u'DOCUMENTS_ZOOM_MIN_LEVEL', 'default': 50, 'description': _(u'Minimum amount in percent (%) to allow user to zoom out a document page interactively.')},
{'name': u'ROTATION_STEP', 'global_name': u'DOCUMENTS_ROTATION_STEP', 'default': 90, 'description': _(u'Amount in degrees to rotate a document page per user interaction.')},
#
{'name': u'CACHE_PATH', 'global_name': u'DOCUMENTS_CACHE_PATH', 'default': os.path.join(settings.PROJECT_ROOT, 'image_cache'), 'exists': True},
]
)

View File

@@ -1,11 +1,13 @@
import os
import tempfile
import hashlib
from django.db import models
from django.utils.translation import ugettext_lazy as _
from django.contrib.auth.models import User
from django.contrib.contenttypes import generic
from django.contrib.comments.models import Comment
from django.conf import settings
from python_magic import magic
@@ -13,12 +15,26 @@ from taggit.managers import TaggableManager
from dynamic_search.api import register
from converter.api import get_page_count
from converter.api import get_available_transformations_choices
from converter.api import create_image_cache_filename, convert
from converter.exceptions import UnknownFormat, UnkownConvertError
from documents.conf.settings import CHECKSUM_FUNCTION
from documents.conf.settings import UUID_FUNCTION
from documents.conf.settings import STORAGE_BACKEND
from documents.conf.settings import PREVIEW_SIZE
from documents.conf.settings import THUMBNAIL_SIZE
from documents.conf.settings import CACHE_PATH
from documents.managers import RecentDocumentManager, \
DocumentPageTransformationManager
from documents.utils import document_save_to_temp_dir
from documents.literals import PICTURE_ERROR_SMALL, PICTURE_ERROR_MEDIUM, \
PICTURE_UNKNOWN_SMALL, PICTURE_UNKNOWN_MEDIUM
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER
# document image cache name hash function
def HASH_FUNCTION(x):
    """Return the hexadecimal SHA-256 digest of x."""
    return hashlib.sha256(x).hexdigest()
def get_filename_from_uuid(instance, filename):
@@ -201,8 +217,7 @@ class Document(models.Model):
exists in storage
"""
return self.file.storage.exists(self.file.path)
def apply_default_transformations(self, transformations):
#Only apply default transformations on new documents
if reduce(lambda x, y: x + y, [page.documentpagetransformation_set.count() for page in self.documentpage_set.all()]) == 0:
@@ -216,6 +231,29 @@ class Document(models.Model):
)
page_transformation.save()
def get_image_cache_name(self, page):
    """
    Return the path of the cached image for the given page number.

    The cache file name is the HASH_FUNCTION digest of the document
    checksum, the page number and the page's transformation list, inside
    CACHE_PATH.  When that file does not exist yet the document is saved
    to the temporary directory and converted, and the value returned by
    convert() is passed back instead.
    """
    transformations, warnings = self.documentpage_set.get(page_number=page).get_transformation_list()

    cache_key = u''.join([self.checksum, unicode(page), unicode(transformations)])
    cache_file_path = os.path.join(CACHE_PATH, HASH_FUNCTION(cache_key))

    if not os.path.exists(cache_file_path):
        # Cache miss: render the page into the cache location
        document_file = document_save_to_temp_dir(self, self.checksum)
        return convert(document_file, output_filepath=cache_file_path, page=page, transformations=transformations)

    return cache_file_path
def get_image(self, size=PREVIEW_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION):
    """
    Return the path of an image of one page of this document.

    The page is first rendered into the image cache (see
    get_image_cache_name) and that cached image is then converted using
    the requested size, zoom and rotation.

    This method does not propagate conversion failures: when the format
    is unknown the path of the "unknown" placeholder picture is returned,
    and on any other error the path of the "error" placeholder picture.
    The small placeholders are used for thumbnail sized requests and the
    medium ones for every other size.
    """
    # Pick placeholders that match the requested size
    if size == THUMBNAIL_SIZE:
        unknown_picture, error_picture = PICTURE_UNKNOWN_SMALL, PICTURE_ERROR_SMALL
    else:
        unknown_picture, error_picture = PICTURE_UNKNOWN_MEDIUM, PICTURE_ERROR_MEDIUM

    try:
        image_cache_name = self.get_image_cache_name(page=page)
        output_file = convert(image_cache_name, cleanup_files=False, size=size, zoom=zoom, rotation=rotation)
    except UnknownFormat:
        output_file = os.path.join(settings.MEDIA_ROOT, u'images', unknown_picture)
    except (UnkownConvertError, Exception):
        # Deliberate catch-all: an image request always yields a picture
        output_file = os.path.join(settings.MEDIA_ROOT, u'images', error_picture)

    return output_file
class DocumentTypeFilename(models.Model):

View File

@@ -1,6 +1,6 @@
from django.template import Library, Node, Variable
from converter.api import get_document_dimensions, QUALITY_PRINT
from converter.api import get_document_dimensions
from documents.views import calculate_converter_arguments
from documents.conf.settings import PRINT_SIZE
@@ -14,8 +14,7 @@ class GetImageSizeNode(Node):
def render(self, context):
document = Variable(self.document).resolve(context)
arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, quality=QUALITY_PRINT)
width, height = get_document_dimensions(document, **arguments)
width, height = get_document_dimensions(document)
context[u'document_width'], context['document_height'] = width, height
context[u'document_aspect'] = float(width) / float(height)
return u''

View File

@@ -1,7 +1,5 @@
from django.conf.urls.defaults import patterns, url
from converter.literals import QUALITY_HIGH, QUALITY_PRINT
from documents.conf.settings import PREVIEW_SIZE
from documents.conf.settings import PRINT_SIZE
from documents.conf.settings import THUMBNAIL_SIZE
@@ -24,8 +22,8 @@ urlpatterns = patterns('documents.views',
url(r'^(?P<document_id>\d+)/display/preview/$', 'get_document_image', {'size': PREVIEW_SIZE}, 'document_preview'),
url(r'^(?P<document_id>\d+)/display/preview/multipage/$', 'get_document_image', {'size': MULTIPAGE_PREVIEW_SIZE}, 'document_preview_multipage'),
url(r'^(?P<document_id>\d+)/display/thumbnail/$', 'get_document_image', {'size': THUMBNAIL_SIZE}, 'document_thumbnail'),
url(r'^(?P<document_id>\d+)/display/$', 'get_document_image', {'size': DISPLAY_SIZE, 'quality': QUALITY_HIGH}, 'document_display'),
url(r'^(?P<document_id>\d+)/display/print/$', 'get_document_image', {'size': PRINT_SIZE, 'quality': QUALITY_PRINT}, 'document_display_print'),
url(r'^(?P<document_id>\d+)/display/$', 'get_document_image', {'size': DISPLAY_SIZE}, 'document_display'),
url(r'^(?P<document_id>\d+)/display/print/$', 'get_document_image', {'size': PRINT_SIZE}, 'document_display_print'),
url(r'^(?P<document_id>\d+)/download/$', 'document_download', (), 'document_download'),
url(r'^(?P<document_id>\d+)/create/siblings/$', 'document_create_siblings', (), 'document_create_siblings'),

View File

@@ -1,6 +1,6 @@
import os
from common import TEMPORARY_DIRECTORY
from common.conf.settings import TEMPORARY_DIRECTORY
#http://stackoverflow.com/questions/123198/how-do-i-copy-a-file-in-python

View File

@@ -20,11 +20,8 @@ from common.widgets import two_state_template
from common.literals import PAGE_SIZE_DIMENSIONS, \
PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE
from common.conf.settings import DEFAULT_PAPER_SIZE
from converter.api import convert_document
from converter.exceptions import UnkownConvertError, UnknownFormat
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \
DEFAULT_PAGE_NUMBER
DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER
from filetransfers.api import serve_file
from grouping.utils import get_document_group_subtemplate
from metadata.api import save_metadata_list, \
@@ -287,7 +284,7 @@ def document_edit(request, document_id):
}, context_instance=RequestContext(request))
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
def get_document_image(request, document_id, size=PREVIEW_SIZE):
check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
document = get_object_or_404(Document, pk=document_id)
@@ -304,36 +301,7 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360
document_page = get_object_or_404(document.documentpage_set, page_number=page)
transformations, warnings = document_page.get_transformation_list()
if warnings and (request.user.is_staff or request.user.is_superuser):
for warning in warnings:
messages.warning(request, _(u'Page transformation error: %s') % warning)
try:
output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations)
except UnkownConvertError, e:
if request.user.is_staff or request.user.is_superuser:
messages.error(request, e)
if size == THUMBNAIL_SIZE:
output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL)
else:
output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_MEDIUM)
except UnknownFormat:
if size == THUMBNAIL_SIZE:
output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_SMALL)
else:
output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_UNKNOWN_MEDIUM)
except Exception, e:
if request.user.is_staff or request.user.is_superuser:
messages.error(request, e)
if size == THUMBNAIL_SIZE:
output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_SMALL)
else:
output_file = os.path.join(settings.MEDIA_ROOT, u'images', PICTURE_ERROR_MEDIUM)
finally:
return sendfile.sendfile(request, output_file)
return sendfile.sendfile(request, document.get_image(size=size, page=page, zoom=zoom, rotation=rotation))
def document_download(request, document_id):
@@ -804,13 +772,14 @@ def document_print(request, document_id):
def document_hard_copy(request, document_id):
#TODO: FIXME
check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
document = get_object_or_404(Document, pk=document_id)
RecentDocument.objects.add_document_for_user(request.user, document)
arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, file_format=DEFAULT_FILE_FORMAT, quality=QUALITY_PRINT)
arguments, warnings = calculate_converter_arguments(document, size=PRINT_SIZE, file_format=DEFAULT_FILE_FORMAT)
# Pre-generate
convert_document(document, **arguments)

View File

@@ -9,18 +9,18 @@ import sys
from django.utils.translation import ugettext as _
from django.utils.importlib import import_module
from common import TEMPORARY_DIRECTORY
from common.conf.settings import TEMPORARY_DIRECTORY
from converter.api import convert
from documents.models import DocumentPage
from documents.utils import document_save_to_temp_dir
from ocr.conf.settings import TESSERACT_PATH
from ocr.conf.settings import TESSERACT_LANGUAGE
from ocr.exceptions import TesseractError
from ocr.exceptions import TesseractError, UnpaperError
from ocr.conf.settings import UNPAPER_PATH
from ocr.parsers import parse_document_page
from ocr.parsers.exceptions import ParserError, ParserUnknownFile
from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT
from ocr.literals import DEFAULT_OCR_FILE_FORMAT, UNPAPER_FILE_FORMAT, \
DEFAULT_OCR_FILE_EXTENSION
def get_language_backend():
@@ -56,8 +56,10 @@ def run_tesseract(input_filename, lang=None):
os.close(fd)
ocr_output = os.extsep.join([filepath, u'txt'])
command = [unicode(TESSERACT_PATH), unicode(input_filename), unicode(filepath)]
if lang is not None:
command += [u'-l', lang]
# TODO: Tesseract 3.0 segfaults
#if lang is not None:
# command.extend([u'-l', lang])
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()
@@ -67,7 +69,13 @@ def run_tesseract(input_filename, lang=None):
cleanup(ocr_output)
raise TesseractError(error_text)
return codecs.open(ocr_output, 'r', 'utf-8'), ocr_output
fd = codecs.open(ocr_output, 'r', 'utf-8')
text = fd.read().strip()
fd.close()
os.unlink(filepath)
return text
def do_document_ocr(queue_document):
@@ -82,36 +90,37 @@ def do_document_ocr(queue_document):
parse_document_page(document_page)
except (ParserError, ParserUnknownFile):
# Fall back to doing visual OCR
transformations = []
document_transformations, warnings = document_page.get_transformation_list()
ocr_transformations, warnings = queue_document.get_transformation_list()
transformations.extend(document_transformations)
transformations.extend(ocr_transformations)
unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
document_filepath = os.path.join(TEMPORARY_DIRECTORY, document_page.document.uuid)
unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
##ocr_transformations, warnings = queue_document.get_transformation_list()
document.save_to_file(document_filepath)
document_filepath = document_page.document.get_image_cache_name(page=document_page.page_number)
unpaper_output_filename = u'%s_unpaper_out_page_%s%s%s' % (document_page.document.uuid, document_page.page_number, os.extsep, UNPAPER_FILE_FORMAT)
unpaper_output_filepath = os.path.join(TEMPORARY_DIRECTORY, unpaper_output_filename)
unpaper_input=convert(document_filepath, file_format=UNPAPER_FILE_FORMAT)
execute_unpaper(input_filepath=unpaper_input, output_filepath=unpaper_output_filepath)
#from PIL import Image, ImageOps
#im = Image.open(document_filepath)
##if im.mode=='RGBA':
## im=im.convert('RGB')
##im = im.convert('L')
#im = ImageOps.grayscale(im)
#im.save(unpaper_output_filepath)
transformed_filepath=convert(document_filepath, file_format=UNPAPER_FILE_FORMAT, page=document_page.page_number, transformations=transformations)
execute_unpaper(input_filepath=transformed_filepath, output_filepath=unpaper_output_filepath)
# Convert to TIFF
pre_ocr_filepath = output_filepath=convert(input_filepath=unpaper_output_filepath, file_format=DEFAULT_OCR_FILE_FORMAT)
# Tesseract needs an explicit file extension
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_FORMAT])
pre_ocr_filepath_w_ext = os.extsep.join([pre_ocr_filepath, DEFAULT_OCR_FILE_EXTENSION])
os.rename(pre_ocr_filepath, pre_ocr_filepath_w_ext)
try:
fd, ocr_output = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
document_page.content = ocr_cleanup(fd.read().strip())
ocr_text = run_tesseract(pre_ocr_filepath_w_ext, TESSERACT_LANGUAGE)
document_page.content = ocr_cleanup(ocr_text)
document_page.page_label = _(u'Text from OCR')
document_page.save()
fd.close()
cleanup(ocr_output)
finally:
cleanup(pre_ocr_filepath_w_ext)
cleanup(transformed_filepath)
cleanup(unpaper_input)
cleanup(document_filepath)
cleanup(unpaper_output_filepath)
@@ -155,6 +164,7 @@ def execute_unpaper(input_filepath, output_filepath):
command = []
command.append(UNPAPER_PATH)
command.append(u'--overwrite')
command.append(u'--no-multi-pages')
command.append(input_filepath)
command.append(output_filepath)
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE)

View File

@@ -4,3 +4,10 @@ class AlreadyQueued(Exception):
class TesseractError(Exception):
pass
class UnpaperError(Exception):
    """Raised by unpaper."""

View File

@@ -20,5 +20,6 @@ QUEUEDOCUMENT_STATE_CHOICES = (
(QUEUEDOCUMENT_STATE_ERROR, _(u'error')),
)
DEFAULT_OCR_FILE_FORMAT = u'tif'
UNPAPER_FILE_FORMAT = u'pnm'
DEFAULT_OCR_FILE_FORMAT = u'tiff'
DEFAULT_OCR_FILE_EXTENSION = u'tif'
UNPAPER_FILE_FORMAT = u'ppm'