Source, document page and thumbnails working; new document transformations and OCR are yet to be converted

This commit is contained in:
Roberto Rosario
2011-07-15 20:25:49 -04:00
parent 743ae0fce0
commit 389253385c
7 changed files with 137 additions and 99 deletions

View File

@@ -1,5 +1,6 @@
import os
import subprocess
import hashlib
from django.utils.importlib import import_module
from django.template.defaultfilters import slugify
@@ -22,6 +23,7 @@ from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR
def HASH_FUNCTION(x):
    """Return the hexadecimal SHA-256 digest of ``x``."""
    digest = hashlib.sha256(x)
    return digest.hexdigest()
CONVERTER_OFFICE_FILE_EXTENSIONS = [
u'ods', u'docx', u'doc'
@@ -75,19 +77,11 @@ def cache_cleanup(input_filepath, *args, **kwargs):
def create_image_cache_filename(input_filepath, *args, **kwargs):
    """
    Build the cache file path for a conversion of ``input_filepath``.

    The file name is the HASH_FUNCTION digest of the input path combined
    with the text form of the positional and keyword arguments, placed
    inside TEMPORARY_DIRECTORY.

    Returns None when ``input_filepath`` is empty or None.
    """
    if not input_filepath:
        return None

    cache_key = u''.join([input_filepath, unicode(args), unicode(kwargs)])
    # hashlib digests bytes: encode explicitly so a path or argument holding
    # non-ASCII characters does not raise UnicodeEncodeError inside sha256.
    # For ASCII-only keys the digest is the same as hashing the unicode
    # string directly.
    hash_value = HASH_FUNCTION(cache_key.encode('utf-8'))
    return os.path.join(TEMPORARY_DIRECTORY, hash_value)
def convert_office_document(input_filepath):
if os.path.exists(UNOCONV_PATH):
@@ -104,21 +98,21 @@ def convert_document(document, *args, **kwargs):
return convert(document_save_to_temp_dir(document, document.checksum), *args, **kwargs)
def convert(input_filepath, *args, **kwargs):
def convert(input_filepath, cleanup_files=True, *args, **kwargs):
size = kwargs.get('size')
file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
rotation = kwargs.get('rotation', DEFAULT_ROTATION)
page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
cleanup_files = kwargs.get('cleanup_files', True)
quality = kwargs.get('quality', QUALITY_DEFAULT)
transformations = kwargs.get('transformations', [])
unoconv_output = None
output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
#if os.path.exists(output_filepath):
# return output_filepath
print 'output_filepath', output_filepath
if os.path.exists(output_filepath):
return output_filepath
path, extension = os.path.splitext(input_filepath)
if extension[1:].lower() in CONVERTER_OFFICE_FILE_EXTENSIONS:
@@ -128,8 +122,6 @@ def convert(input_filepath, *args, **kwargs):
input_filepath = result
extra_options = u''
#TODO: not here in the backend
input_arg = u'%s[%s]' % (input_filepath, page-1)
transformations.append(
{
'transformation': TRANSFORMATION_RESIZE,
@@ -154,7 +146,7 @@ def convert(input_filepath, *args, **kwargs):
)
try:
backend.convert_file(input_filepath=input_arg, output_filepath=u'%s:%s' % (file_format, output_filepath), quality=quality, transformations=transformations)
backend.convert_file(input_filepath=input_filepath, output_filepath=output_filepath, quality=quality, transformations=transformations, page=page, file_format=file_format)
finally:
if cleanup_files:
cleanup(input_filepath)
@@ -189,14 +181,12 @@ def convert_document_for_ocr(document, page=DEFAULT_PAGE_NUMBER, file_format=DEF
unpaper_output_file = u'%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
convert_output_file = u'%s_ocr%s%s%s' % (temp_path, page, os.extsep, file_format)
input_arg = u'%s[%s]' % (input_filepath, page-1)
try:
document_page = document.documentpage_set.get(page_number=page + 1)
document_page = document.documentpage_set.get(page_number=page)
transformation_string, warnings = document_page.get_transformation_string()
#Apply default transformations
backend.convert_file(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file)
backend.convert_file(input_filepath=input_filepath, page=page, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file)
#Do OCR operations
backend.convert_file(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
# Process by unpaper

View File

@@ -11,7 +11,8 @@ from converter.backends import ConverterBase
from converter.literals import TRANSFORMATION_RESIZE, \
TRANSFORMATION_ROTATE, TRANSFORMATION_DENSITY, \
TRANSFORMATION_ZOOM
from converter.literals import DIMENSION_SEPARATOR
from converter.literals import DIMENSION_SEPARATOR, DEFAULT_PAGE_NUMBER, \
DEFAULT_FILE_FORMAT
CONVERTER_ERROR_STRING_NO_DECODER = u'No decode delegate for this image format'
CONVERTER_ERROR_STARTS_WITH = u'starts with'
@@ -32,10 +33,12 @@ class ConverterClass(ConverterBase):
return proc.stdout.read()
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT):
def convert_file(self, input_filepath, output_filepath, transformations=None, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
print 'convert_file'
arguments = []
if transformations:
for transformation in transformations:
print 'transformation: %s' % transformation
if transformation['transformation'] == TRANSFORMATION_RESIZE:
dimensions = []
dimensions.append(unicode(transformation['arguments']['width']))
@@ -46,21 +49,31 @@ class ConverterClass(ConverterBase):
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
arguments.append(u'-resize')
arguments.append(u'%d%%' % transformation['arguments']['zoom'])
arguments.append(u'%d%%' % transformation['arguments']['percent'])
elif transformation['transformation'] == TRANSFORMATION_ROTATE:
print 'Do rotate'
arguments.append(u'-rotate')
arguments.append(u'%s' % transformation['arguments']['degrees'])
print 'arguments: %s' % arguments
#if format == u'jpg':
# extra_options += u' -quality 85'
if format == u'jpg':
arguments.append(u'-quality')
arguments.append(u'85')
# Graphicsmagick page number is 0 base
input_arg = u'%s[%d]' % (input_filepath, page - 1)
# Specify the file format next to the output filename
output_filepath = u'%s:%s' % (file_format, output_filepath)
command = []
command.append(unicode(GM_PATH))
command.append(u'convert')
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.extend(unicode(GM_SETTINGS).split())
command.append(unicode(input_filepath))
command.append(unicode(input_arg))
if arguments:
command.extend(arguments)
command.append(unicode(output_filepath))
@@ -115,10 +128,3 @@ class ConverterClass(ConverterBase):
except:
#TODO: send to other page number identifying program
return 1
def _get_transformation_string():
pass
#'command_line': u'-rotate %(degrees)d'
# }
#}

View File

@@ -31,15 +31,42 @@ class ConverterClass(ConverterBase):
return proc.stdout.read()
def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, arguments=None):
#if format == u'jpg':
# extra_options += u' -quality 85'
def convert_file(self, input_filepath, output_filepath, quality=QUALITY_DEFAULT, page=DEFAULT_PAGE_NUMBER, file_format=DEFAULT_FILE_FORMAT):
arguments = []
if transformations:
for transformation in transformations:
if transformation['transformation'] == TRANSFORMATION_RESIZE:
dimensions = []
dimensions.append(unicode(transformation['arguments']['width']))
if 'height' in transformation['arguments']:
dimensions.append(unicode(transformation['arguments']['height']))
arguments.append(u'-resize')
arguments.append(u'%s' % DIMENSION_SEPARATOR.join(dimensions))
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
arguments.append(u'-resize')
arguments.append(u'%d%%' % transformation['arguments']['percent'])
elif transformation['transformation'] == TRANSFORMATION_ROTATE:
arguments.append(u'-rotate')
arguments.append(u'%s' % transformation['arguments']['degrees'])
if format == u'jpg':
arguments.append(u'-quality')
arguments.append(u'85')
# Imagemagick page number is 0 base
input_arg = u'%s[%d]' % (input_filepath, page - 1)
# Specify the file format next to the output filename
output_filepath = u'%s:%s' % (file_format, output_filepath)
command = []
command.append(unicode(IM_CONVERT_PATH))
command.extend(unicode(QUALITY_SETTINGS[quality]).split())
command.append(unicode(input_filepath))
command.append(unicode(input_arg))
if arguments:
command.extend(unicode(arguments).split())
command.extend(arguments)
command.append(unicode(output_filepath))
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait()

View File

@@ -20,4 +20,17 @@ class DocumentPageTransformationManager(models.Manager):
return self.model.objects.filter(document_page=document_page)
def get_for_document_page_as_list(self, document_page):
    """
    Return the transformations stored for ``document_page`` as plain dicts.

    The ``arguments`` text of every stored row is evaluated into a Python
    object. A row whose arguments fail to evaluate is left out of the
    result and the exception raised is collected instead of propagating.

    Returns a ``(transformations, warnings)`` tuple:
    ``transformations`` is a list of
    ``{'transformation': ..., 'arguments': ...}`` dicts and ``warnings``
    is the list of exceptions raised while evaluating arguments.
    """
    warnings = []
    transformations = []
    rows = self.get_for_document_page(document_page).values('transformation', 'arguments')
    for row in rows:
        try:
            # SECURITY: eval() runs whatever expression is stored in the
            # row. Passing an empty globals dict does not sandbox it,
            # builtins are still reachable. Only trusted users must be
            # able to write this field.
            entry = {
                'transformation': row['transformation'],
                'arguments': eval(row['arguments'], {})
            }
        except Exception as error:
            warnings.append(error)
        else:
            transformations.append(entry)

    return transformations, warnings

View File

@@ -20,10 +20,11 @@ from common.widgets import two_state_template
from common.literals import PAGE_SIZE_DIMENSIONS, \
PAGE_ORIENTATION_PORTRAIT, PAGE_ORIENTATION_LANDSCAPE
from common.conf.settings import DEFAULT_PAPER_SIZE
from converter.api import convert_document, QUALITY_DEFAULT
from converter.api import convert_document
from converter.exceptions import UnkownConvertError, UnknownFormat
from converter.api import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
DEFAULT_FILE_FORMAT, QUALITY_PRINT
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION, \
DEFAULT_FILE_FORMAT, QUALITY_PRINT, QUALITY_DEFAULT, \
DEFAULT_PAGE_NUMBER
from filetransfers.api import serve_file
from grouping.utils import get_document_group_subtemplate
from metadata.api import save_metadata_list, \
@@ -285,39 +286,15 @@ def document_edit(request, document_id):
'object': document,
}, context_instance=RequestContext(request))
'''
def calculate_converter_arguments(document, *args, **kwargs):
size = kwargs.pop('size', PREVIEW_SIZE)
quality = kwargs.pop('quality', QUALITY_DEFAULT)
page = kwargs.pop('page', 1)
file_format = kwargs.pop('file_format', DEFAULT_FILE_FORMAT)
zoom = kwargs.pop('zoom', DEFAULT_ZOOM_LEVEL)
rotation = kwargs.pop('rotation', DEFAULT_ROTATION)
document_page = DocumentPage.objects.get(document=document, page_number=page)
transformation_string, warnings = document_page.get_transformation_string()
arguments = {
'size': size,
'file_format': file_format,
'quality': quality,
'extra_options': transformation_string,
'page': page - 1,
'zoom': zoom,
'rotation': rotation
}
return arguments, warnings
'''
def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_DEFAULT):
check_permissions(request.user, [PERMISSION_DOCUMENT_VIEW])
document = get_object_or_404(Document, pk=document_id)
page = int(request.GET.get('page', 1))
page = int(request.GET.get('page', DEFAULT_PAGE_NUMBER))
zoom = int(request.GET.get('zoom', 100))
zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL))
if zoom < ZOOM_MIN_LEVEL:
zoom = ZOOM_MIN_LEVEL
@@ -325,18 +302,16 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE, quality=QUALITY_
if zoom > ZOOM_MAX_LEVEL:
zoom = ZOOM_MAX_LEVEL
rotation = int(request.GET.get('rotation', 0)) % 360
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360
#arguments, warnings = calculate_converter_arguments(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation)
#if warnings and (request.user.is_staff or request.user.is_superuser):
# for warning in warnings:
# messages.warning(request, _(u'Page transformation error: %s') % warning)
transformations = DocumentPageTransformation.objects.get_for_document_page_as_list(document)
document_page = get_object_or_404(document.documentpage_set, page_number=page)
transformations, warnings = DocumentPageTransformation.objects.get_for_document_page_as_list(document_page)
if warnings and (request.user.is_staff or request.user.is_superuser):
for warning in warnings:
messages.warning(request, _(u'Page transformation error: %s') % warning)
try:
#output_file = convert_document(document, **arguments)
output_file = convert_document(document, size=size, file_format=DEFAULT_FILE_FORMAT, quality=quality, page=page, zoom=zoom, rotation=rotation, transformations=transformations)
except UnkownConvertError, e:
if request.user.is_staff or request.user.is_superuser:
@@ -595,13 +570,13 @@ def document_page_view(request, document_page_id):
document_page = get_object_or_404(DocumentPage, pk=document_page_id)
zoom = int(request.GET.get('zoom', 100))
rotation = int(request.GET.get('rotation', 0))
zoom = int(request.GET.get('zoom', DEFAULT_ZOOM_LEVEL))
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION))
document_page_form = DocumentPageForm(instance=document_page, zoom=zoom, rotation=rotation)
base_title = _(u'details for: %s') % document_page
if zoom != 100:
if zoom != DEFAULT_ZOOM_LEVEL:
zoom_text = u'(%d%%)' % zoom
else:
zoom_text = u''

View File

@@ -7,5 +7,21 @@ class SourceTransformationManager(models.Manager):
ct = ContentType.objects.get_for_model(obj)
return self.model.objects.filter(content_type=ct).filter(object_id=obj.pk)
#def get_for_object_as_list(self, obj):
# return list([{'transformation': transformation['transformation'], 'arguments': eval(transformation['arguments'])} for transformation in self.get_for_object(obj).values('transformation', 'arguments')])
def get_for_object_as_list(self, obj):
    """
    Return the transformations stored for ``obj`` as plain dicts.

    The ``arguments`` text of every stored row is evaluated into a Python
    object. A row whose arguments fail to evaluate is left out of the
    result and the exception raised is collected instead of propagating.

    Returns a ``(transformations, warnings)`` tuple:
    ``transformations`` is a list of
    ``{'transformation': ..., 'arguments': ...}`` dicts and ``warnings``
    is the list of exceptions raised while evaluating arguments.
    """
    warnings = []
    transformations = []
    rows = self.get_for_object(obj).values('transformation', 'arguments')
    for row in rows:
        try:
            # SECURITY: eval() runs whatever expression is stored in the
            # row. Passing an empty globals dict does not sandbox it,
            # builtins are still reachable. Only trusted users must be
            # able to write this field.
            entry = {
                'transformation': row['transformation'],
                'arguments': eval(row['arguments'], {})
            }
        except Exception as error:
            warnings.append(error)
        else:
            transformations.append(entry)

    return transformations, warnings

View File

@@ -285,9 +285,11 @@ def staging_file_preview(request, source_type, source_id, staging_file_id):
staging_folder = get_object_or_404(StagingFolder, pk=source_id)
StagingFile = create_staging_file_class(request, staging_folder.folder_path)
try:
transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder)
output_file, errors = StagingFile.get(staging_file_id).preview(
preview_size=staging_folder.get_preview_size(),
transformations=SourceTransformation.objects.get_for_object_as_list(staging_folder)
transformations=transformations
)
if errors and (request.user.is_staff or request.user.is_superuser):
for error in errors:
@@ -321,9 +323,10 @@ def staging_file_delete(request, source_type, source_id, staging_file_id):
if request.method == 'POST':
try:
transformations, errors=SourceTransformation.objects.get_for_object_as_list(staging_folder)
staging_file.delete(
preview_size=staging_folder.get_preview_size(),
transformations=SourceTransformation.objects.get_for_object_as_list(staging_folder)
transformations=transformations
)
messages.success(request, _(u'Staging file delete successfully.'))
except Exception, e:
@@ -516,12 +519,16 @@ def setup_source_transformation_edit(request, transformation_id):
if form.is_valid():
try:
# Test the validity of the argument field
eval(form.cleaned_data['arguments'])
form.save()
messages.success(request, _(u'Source transformation edited successfully'))
return HttpResponseRedirect(next)
except Exception, e:
messages.error(request, _(u'Error editing source transformation; %s') % e)
eval(form.cleaned_data['arguments'], {})
except:
messages.error(request, _(u'Source transformation argument error.'))
else:
try:
form.save()
messages.success(request, _(u'Source transformation edited successfully'))
return HttpResponseRedirect(next)
except Exception, e:
messages.error(request, _(u'Error editing source transformation; %s') % e)
else:
form = SourceTransformationForm(instance=source_transformation)
@@ -607,14 +614,18 @@ def setup_source_transformation_create(request, source_type, source_id):
if form.is_valid():
try:
# Test the validity of the argument field
eval(form.cleaned_data['arguments'])
source_tranformation = form.save(commit=False)
source_tranformation.content_object = source
source_tranformation.save()
messages.success(request, _(u'Source transformation created successfully'))
return HttpResponseRedirect(redirect_view)
except Exception, e:
messages.error(request, _(u'Error creating source transformation; %s') % e)
eval(form.cleaned_data['arguments'], {})
except:
messages.error(request, _(u'Source transformation argument error.'))
else:
try:
source_tranformation = form.save(commit=False)
source_tranformation.content_object = source
source_tranformation.save()
messages.success(request, _(u'Source transformation created successfully'))
return HttpResponseRedirect(redirect_view)
except Exception, e:
messages.error(request, _(u'Error creating source transformation; %s') % e)
else:
form = SourceTransformationForm_create()