Further converter refactor and initial move from document-centric to page-centric image generation. Issue #93.

This commit is contained in:
Roberto Rosario
2015-06-06 06:26:44 -04:00
parent de6182aea0
commit f4752a3f3f
7 changed files with 105 additions and 207 deletions

View File

@@ -40,21 +40,17 @@ class Python(ConverterBase):
def get_page_count(self, file_object, mimetype=None):
page_count = 1
#file_object, input_filepath = mkstemp()
#file_object.write(input_data)
if not mimetype:
#mimetype, encoding = get_mimetype(file_description=open(input_filepath, 'rb'), filepath=None, mimetype_only=True)
mimetype, encoding = get_mimetype(file_object=file_object, mimetype_only=True)
else:
encoding = None
if mimetype == 'application/pdf':
# If file is a PDF open it with slate to determine the page count
#with open(input_filepath) as fd:
try:
pages = slate.PDF(file_object)
except:
except Exception as exception:
logger.error('slate exception; %s', exception)
return 1
# TODO: Maybe return UnknownFileFormat to display proper unknown file format message in document description
else:
@@ -63,7 +59,6 @@ class Python(ConverterBase):
file_object.seek(0)
try:
#im = Image.fromarray(input_data)
image = Image.open(file_object)
except IOError: # cannot identify image file
raise UnknownFileFormat
@@ -81,47 +76,23 @@ class Python(ConverterBase):
return page_count
def convert(self, file_object, mimetype=None, output_format=DEFAULT_FILE_FORMAT, page=DEFAULT_PAGE_NUMBER):
#tmpfile = None
#mimetype = kwargs.get('mimetype', None)
if not mimetype:
mimetype, encoding = get_mimetype(file_object=file_object, mimetype_only=True)
##try:
print "MIME!", mimetype
if mimetype == 'application/pdf' and pdftoppm:
image_buffer = io.BytesIO()
new_file_object, input_filepath = tempfile.mkstemp()
os.write(new_file_object, file_object.read())
#file_object.seek(0)
#new_file_object.seek(0)
os.close(new_file_object)
pdftoppm(input_filepath, f=page, l=page, _out=image_buffer)
image_buffer.seek(0)
image = Image.open(image_buffer)
# TODO: remove input_filepath
fs_cleanup(input_filepath)
else:
image = Image.open(file_object)
##except Exception as exception:
## logger.error('Error converting image; %s', exception)
## # Python Imaging Library doesn't recognize it as an image
## raise ConvertError
##except IOError: # cannot identify image file
## raise UnknownFileFormat
#finally:
# if tmpfile:
# fs_cleanup(tmpfile)
current_page = 0
try:
while current_page == page - 1:
@@ -132,36 +103,35 @@ class Python(ConverterBase):
# end of sequence
pass
'''
try:
if transformations:
aspect = 1.0 * im.size[0] / im.size[1]
for transformation in transformations:
arguments = transformation.get('arguments')
if transformation['transformation'] == TRANSFORMATION_RESIZE:
width = int(arguments.get('width', 0))
height = int(arguments.get('height', 1.0 * width * aspect))
im = self.resize(im, (width, height))
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
decimal_value = float(arguments.get('percent', 100)) / 100
im = im.transform((int(im.size[0] * decimal_value), int(im.size[1] * decimal_value)), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
elif transformation['transformation'] == TRANSFORMATION_ROTATE:
# PIL counts degrees counter-clockwise, reverse them
im = im.rotate(360 - arguments.get('degrees', 0))
except:
# Ignore all transformation errors
pass
'''
if image.mode not in ('L', 'RGB'):
image = image.convert('RGB')
output = StringIO()
image.save(output, format=output_format)
return output
'''
try:
if transformations:
aspect = 1.0 * im.size[0] / im.size[1]
for transformation in transformations:
arguments = transformation.get('arguments')
if transformation['transformation'] == TRANSFORMATION_RESIZE:
width = int(arguments.get('width', 0))
height = int(arguments.get('height', 1.0 * width * aspect))
im = self.resize(im, (width, height))
elif transformation['transformation'] == TRANSFORMATION_ZOOM:
decimal_value = float(arguments.get('percent', 100)) / 100
im = im.transform((int(im.size[0] * decimal_value), int(im.size[1] * decimal_value)), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
elif transformation['transformation'] == TRANSFORMATION_ROTATE:
# PIL counts degrees counter-clockwise, reverse them
im = im.rotate(360 - arguments.get('degrees', 0))
except:
# Ignore all transformation errors
pass
'''
# From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
def resize(self, img, box, fit=False, out=None):
"""

View File

@@ -19,7 +19,6 @@ from .literals import (
TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR
)
from .office_converter import OfficeConverter
from .runtime import backend, office_converter
from .settings import GRAPHICS_BACKEND, LIBREOFFICE_PATH
CONVERTER_OFFICE_FILE_MIMETYPES = [
@@ -68,7 +67,6 @@ CONVERTER_OFFICE_FILE_MIMETYPES = [
'text/plain',
'text/rtf',
]
logger = logging.getLogger(__name__)
@@ -109,23 +107,6 @@ class TransformationScale(BaseTransformation):
class Converter(object):
"""
def cache_cleanup(input_filepath, *args, **kwargs):
try:
os.remove(create_image_cache_filename(input_filepath, *args, **kwargs))
except OSError:
pass
"""
"""
def create_image_cache_filename(input_filepath, *args, **kwargs):
if input_filepath:
hash_value = HASH_FUNCTION(''.join([HASH_FUNCTION(smart_str(input_filepath)), unicode(args), unicode(kwargs)]))
return os.path.join(TEMPORARY_DIRECTORY, hash_value)
else:
return None
"""
@staticmethod
def soffice(file_object):
@@ -159,8 +140,7 @@ class Converter(object):
readline = proc.stderr.readline()
logger.debug('stderr: %s', readline)
if return_code != 0:
#raise OfficeBackendError(readline)
raise Exception(readline)
raise OfficeBackendError(readline)
filename, extension = os.path.splitext(os.path.basename(input_filepath))
logger.debug('filename: %s', filename)
@@ -169,106 +149,35 @@ class Converter(object):
converted_output = os.path.join(TEMPORARY_DIRECTORY, os.path.extsep.join([filename, 'pdf']))
logger.debug('converted_output: %s', converted_output)
return open(converted_output)
#os.rename(converted_output, output_filepath)
# TODO: remove temp file
return converted_output
def __init__(self, file_object, mime_type=None):
self.file_object = file_object
self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0]
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
if os.path.exists(LIBREOFFICE_PATH):
#file_object, filename = mkstemp()
# Cache results of conversion
#output_filepath = os.path.join(TEMPORARY_DIRECTORY, ''.join([self.input_filepath, CACHED_FILE_SUFFIX]))
result = Converter.soffice(file_object)
file_object.close()
self.file_object = result
self.mime_type = 'application/pdf'
#try:
# self.backend.convert(self.input_filepath, self.output_filepath)
# self.exists = True
#except OfficeBackendError as exception:
# # convert exception so that at least the mime type icon is displayed
# raise UnknownFileFormat(exception)
#else:
# result = office_converter.convert(self.file_object, mimetype=mime_type)
# self.file_object.close()
# self.file_object = result
else:
# TODO: NO LIBREOFFICE ERROR
pass
self.temporary_files = []
def transform(self, transformations, page=DEFAULT_PAGE_NUMBER):
pass
def convert(self, output_format=DEFAULT_FILE_FORMAT, page=DEFAULT_PAGE_NUMBER):#, *args, **kwargs):
#size = kwargs.get('size')
#file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
#zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
#rotation = kwargs.get('rotation', DEFAULT_ROTATION)
#page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
#transformations = kwargs.get('transformations', [])
def convert(self, output_format=DEFAULT_FILE_FORMAT, page=DEFAULT_PAGE_NUMBER):
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
if os.path.exists(LIBREOFFICE_PATH):
converted_output = Converter.soffice(self.file_object)
self.file_object.close()
self.file_object = open(converted_output)
self.mime_type = 'application/pdf'
self.temporary_file.append(converted_output)
else:
# TODO: NO LIBREOFFICE FOUND ERROR
pass
#if transformations is None:
# transformations = []
#if output_filepath is None:
# output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
#if os.path.exists(output_filepath):
# return output_filepath
'''
if office_converter:
try:
office_converter.convert(input_filepath, mimetype=mimetype)
if office_converter.exists:
input_filepath = office_converter.output_filepath
mimetype = 'application/pdf'
else:
# Recycle the already detected mimetype
mimetype = office_converter.mimetype
except OfficeConversionError:
raise UnknownFileFormat('office converter exception')
if size:
transformations.append(
{
'transformation': TRANSFORMATION_RESIZE,
'arguments': dict(zip(['width', 'height'], size.split(DIMENSION_SEPARATOR)))
}
)
if zoom != 100:
transformations.append(
{
'transformation': TRANSFORMATION_ZOOM,
'arguments': {'percent': zoom}
}
)
if rotation != 0 and rotation != 360:
transformations.append(
{
'transformation': TRANSFORMATION_ROTATE,
'arguments': {'degrees': rotation}
}
)
'''
for temporary_file in self.temporary_files:
fs_cleanup(temporary_file)
return backend.convert(file_object=self.file_object, mimetype=self.mime_type, output_format=output_format, page=page)
def get_page_count(self):
return backend.get_page_count(file_object)
def get_page_count(self):
return backend.get_page_count(file_object)
'''

View File

@@ -36,7 +36,7 @@ from .serializers import (
RecentDocumentSerializer
)
from .settings import DISPLAY_SIZE, ZOOM_MAX_LEVEL, ZOOM_MIN_LEVEL
from .tasks import task_get_document_image, task_new_document
from .tasks import task_get_document_page_image, task_new_document
class APIDocumentListView(generics.ListAPIView):
@@ -202,8 +202,10 @@ class APIDocumentImageView(generics.GenericAPIView):
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360
document_page = document.pages.get(page_number=page)
try:
task = task_get_document_image.apply_async(kwargs=dict(document_id=document.pk, size=size, page=page, zoom=zoom, rotation=rotation, as_base64=True, version=version), queue='converter')
task = task_get_document_page_image.apply_async(kwargs=dict(document_page_id=document_page.pk, size=size, zoom=zoom, rotation=rotation, as_base64=True, version=version), queue='converter')
return Response({
'status': 'success',
'data': task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT)

View File

@@ -17,6 +17,7 @@ from django.utils.translation import ugettext_lazy as _
from acls.utils import apply_default_acls
from common.settings import TEMPORARY_DIRECTORY
from common.utils import fs_cleanup
from converter.classes import Converter
from converter.exceptions import UnknownFileFormat
from converter.literals import (
@@ -526,7 +527,15 @@ class DocumentPage(models.Model):
def document(self):
return self.document_version.document
def get_image(self, *args, **kargs):
def get_uuid(self):
return 'page-cache-{}'.format(self.pk)
def get_cache_filename(self):
return os.path.join(CACHE_PATH, self.get_uuid())
def get_image(self, *args, **kwargs):
transformations = kwargs.pop('transformations', [])
#size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, as_base64=False, version=None):
#if zoom < ZOOM_MIN_LEVEL:
# zoom = ZOOM_MIN_LEVEL
@@ -538,20 +547,34 @@ class DocumentPage(models.Model):
#file_path = self.get_valid_image(size=size, page=page, zoom=zoom, rotation=rotation, version=version)
#logger.debug('file_path: %s', file_path)
as_base64 = kwargs.pop('as_base64', False)
converter = Converter(file_object=self.document_version.open())
data = converter.convert(page=self.page_number)
#print "data!!!!", data.getvalue()
##, *args, **kwargs):
return 'data:%s;base64,%s' % ('PNG', base64.b64encode(data.getvalue()))
cache_filename = self.get_cache_filename()
#if as_base64:
# with open(file_path, 'r') as file_object:
# #mimetype = get_mimetype(file_object=file_object, mimetype_only=True)[0]
# base64_data = base64.b64encode(file_object.read())
# return 'data:%s;base64,%s' % (mimetype, base64_data)
#else:
# return file_path
if os.path.exists(cache_filename) and 0:
with open(cache_filename) as file_object:
data = file_object.read()
if as_base64:
return 'data:%s;base64,%s' % ('image/png', base64.b64encode(data))
else:
return data
else:
try:
converter = Converter(file_object=self.document_version.open())
image_buffer = converter.convert(page=self.page_number, output_format='PNG')
with open(cache_filename, 'wb+') as file_object:
file_object.write(image_buffer.getvalue())
except:
fs_cleanup(cache_filename)
raise
else:
data = image_buffer.getvalue()
image_buffer.close()
if as_base64:
return 'data:%s;base64,%s' % ('image/png', base64.b64encode(data))
else:
return data
def argument_validator(value):

View File

@@ -9,16 +9,15 @@ from mayan.celery import app
from common.models import SharedUploadedFile
from .models import Document, DocumentType, DocumentVersion
from .models import Document, DocumentPage, DocumentType, DocumentVersion
logger = logging.getLogger(__name__)
@app.task(compression='zlib')
def task_get_document_image(document_id, *args, **kwargs):
document = Document.objects.get(pk=document_id)
first_page = document.latest_version.pages.first()
return first_page.get_image(*args, **kwargs)
def task_get_document_page_image(document_page_id, *args, **kwargs):
document_page = DocumentPage.objects.get(pk=document_page_id)
return document_page.get_image(*args, **kwargs)
@app.task(ignore_result=True)

View File

@@ -57,7 +57,8 @@ from .settings import (
ZOOM_MAX_LEVEL, ZOOM_MIN_LEVEL
)
from .tasks import (
task_clear_image_cache, task_get_document_image, task_update_page_count
task_clear_image_cache, task_get_document_page_image,
task_update_page_count
)
from .utils import parse_range
@@ -366,17 +367,15 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE):
rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360
task = task_get_document_image.apply_async(kwargs=dict(document_id=document.pk, size=size, page=page, zoom=zoom, rotation=rotation, as_base64=False, version=version), queue='converter')
document_page = document.pages.get(page_number=page)
task = task_get_document_page_image.apply_async(kwargs=dict(document_page_id=document_page.pk, size=size, zoom=zoom, rotation=rotation, as_base64=False, version=version), queue='converter')
data = task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT)
response = HttpResponse(data, content_type='data/PNG')
#response['Content-Disposition'] = 'attachment; filename="somefilename.pdf"'
response = HttpResponse(data, content_type='image')
return response
#print 'data!!!!!!!!!!!', task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT)
#re
# TODO: remove sendfile
#return sendfile.sendfile(request, task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT), mimetype=DEFAULT_FILE_FORMAT_MIMETYPE)

View File

@@ -24,7 +24,7 @@ class DocumentPageImageWidget(forms.widgets.Widget):
if value:
output = []
output.append('<div class="full-height scrollable mayan-page-wrapper-interactive" data-height-difference=230>')
output.append(document_html_widget(value.document, page=value.page_number, zoom=zoom, rotation=rotation, image_class='lazy-load-interactive', nolazyload=False, size=DISPLAY_SIZE))
output.append(document_html_widget(value, zoom=zoom, rotation=rotation, image_class='lazy-load-interactive', nolazyload=False, size=DISPLAY_SIZE))
output.append('</div>')
return mark_safe(''.join(output))
else:
@@ -46,21 +46,16 @@ class DocumentPagesCarouselWidget(forms.widgets.Widget):
document_pages = []
total_pages = 0
# Reuse expensive values
latest_version_pk = value.latest_version.pk
for page in document_pages:
output.append('<div class="carousel-item">')
output.append(
document_html_widget(
page.document,
page,
click_view='documents:document_page_view',
click_view_arguments=[page.pk],
page=page.page_number,
fancybox_class='',
image_class='lazy-load-carousel',
size=DISPLAY_SIZE,
version=latest_version_pk,
post_load_class='lazy-load-carousel-loaded',
)
)
@@ -73,29 +68,25 @@ class DocumentPagesCarouselWidget(forms.widgets.Widget):
def document_thumbnail(document, **kwargs):
return document_html_widget(document, click_view='documents:document_display', **kwargs)
return document_html_widget(document.latest_version.pages.first(), click_view='documents:document_display', **kwargs)
def document_link(document):
return mark_safe('<a href="%s">%s</a>' % (document.get_absolute_url(), document))
def document_html_widget(document, click_view=None, click_view_arguments=None, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, gallery_name=None, fancybox_class='fancybox', version=None, image_class='lazy-load', title=None, size=THUMBNAIL_SIZE, nolazyload=False, post_load_class=None):
def document_html_widget(document_page, click_view=None, click_view_arguments=None, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, gallery_name=None, fancybox_class='fancybox', image_class='lazy-load', title=None, size=THUMBNAIL_SIZE, nolazyload=False, post_load_class=None):
result = []
alt_text = _('Document page image')
if not version:
try:
version = document.latest_version.pk
except AttributeError:
version = None
document = document_page.document
page = document_page.page_number
query_dict = {
'page': page,
'zoom': zoom,
'rotation': rotation,
'version': version,
'size': size,
}
@@ -116,7 +107,12 @@ def document_html_widget(document, click_view=None, click_view_arguments=None, p
title_template = ''
if click_view:
result.append('<a %s class="%s" href="%s" %s>' % (gallery_template, fancybox_class, '%s?%s' % (reverse(click_view, args=click_view_arguments or [document.pk]), query_string), title_template))
result.append('<a {gallery_template} class="{fancybox_class}" href="{image_data}" {title_template}>'.format(
gallery_template=gallery_template,
fancybox_class=fancybox_class,
image_data='%s?%s' % (reverse(click_view, args=click_view_arguments or [document.pk]), query_string),
title_template=title_template
))
if nolazyload:
result.append('<img class="img-nolazyload" src="%s" alt="%s" />' % (preview_view, alt_text))