Further converter refactor and initial move from document-centric to page-centric image generation. Issue #93.

2015-06-06 06:26:44 -04:00
parent de6182aea0
commit f4752a3f3f
7 changed files with 105 additions and 207 deletions
--- a/mayan/apps/converter/backends/python.py
+++ b/mayan/apps/converter/backends/python.py
@@ -40,21 +40,17 @@ class Python(ConverterBase):
    def get_page_count(self, file_object, mimetype=None):
        page_count = 1

-        #file_object, input_filepath = mkstemp()
-        #file_object.write(input_data)
-
        if not mimetype:
-            #mimetype, encoding = get_mimetype(file_description=open(input_filepath, 'rb'), filepath=None, mimetype_only=True)
            mimetype, encoding = get_mimetype(file_object=file_object, mimetype_only=True)
        else:
            encoding = None

        if mimetype == 'application/pdf':
            # If file is a PDF open it with slate to determine the page count
-            #with open(input_filepath) as fd:
            try:
                pages = slate.PDF(file_object)
-            except:
+            except Exception as exception:
+                logger.error('slate exception; %s', exception)
                return 1
                # TODO: Maybe return UnknownFileFormat to display proper unknwon file format message in document description
            else:
@@ -63,7 +59,6 @@ class Python(ConverterBase):
                file_object.seek(0)

        try:
-            #im = Image.fromarray(input_data)
            image = Image.open(file_object)
        except IOError:  # cannot identify image file
            raise UnknownFileFormat
@@ -81,47 +76,23 @@ class Python(ConverterBase):
        return page_count

    def convert(self, file_object, mimetype=None, output_format=DEFAULT_FILE_FORMAT, page=DEFAULT_PAGE_NUMBER):
-
-        #tmpfile = None
-        #mimetype = kwargs.get('mimetype', None)
-
        if not mimetype:
            mimetype, encoding = get_mimetype(file_object=file_object, mimetype_only=True)

-        ##try:
-        print "MIME!", mimetype
        if mimetype == 'application/pdf' and pdftoppm:
            image_buffer = io.BytesIO()

            new_file_object, input_filepath = tempfile.mkstemp()
            os.write(new_file_object, file_object.read())
-            #file_object.seek(0)
-            #new_file_object.seek(0)
            os.close(new_file_object)

-
-
            pdftoppm(input_filepath, f=page, l=page, _out=image_buffer)
            image_buffer.seek(0)
            image = Image.open(image_buffer)
-            # TODO: remove input_filepath
+            fs_cleanup(input_filepath)
        else:
            image = Image.open(file_object)

-
-
-        ##except Exception as exception:
-        ##    logger.error('Error converting image; %s', exception)
-        ##    # Python Imaging Library doesn't recognize it as an image
-        ##    raise ConvertError
-        ##except IOError:  # cannot identify image file
-        ##    raise UnknownFileFormat
-
-
-        #finally:
-        #    if tmpfile:
-        #        fs_cleanup(tmpfile)
-
        current_page = 0
        try:
            while current_page == page - 1:
@@ -132,36 +103,35 @@ class Python(ConverterBase):
            # end of sequence
            pass

-        '''
-        try:
-            if transformations:
-                aspect = 1.0 * im.size[0] / im.size[1]
-                for transformation in transformations:
-                    arguments = transformation.get('arguments')
-                    if transformation['transformation'] == TRANSFORMATION_RESIZE:
-                        width = int(arguments.get('width', 0))
-                        height = int(arguments.get('height', 1.0 * width * aspect))
-                        im = self.resize(im, (width, height))
-                    elif transformation['transformation'] == TRANSFORMATION_ZOOM:
-                        decimal_value = float(arguments.get('percent', 100)) / 100
-                        im = im.transform((int(im.size[0] * decimal_value), int(im.size[1] * decimal_value)), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
-                    elif transformation['transformation'] == TRANSFORMATION_ROTATE:
-                        # PIL counter degress counter-clockwise, reverse them
-                        im = im.rotate(360 - arguments.get('degrees', 0))
-        except:
-            # Ignore all transformation error
-            pass
-        '''
-
        if image.mode not in ('L', 'RGB'):
            image = image.convert('RGB')

-
        output = StringIO()
        image.save(output, format=output_format)

        return output

+    '''
+    try:
+        if transformations:
+            aspect = 1.0 * im.size[0] / im.size[1]
+            for transformation in transformations:
+                arguments = transformation.get('arguments')
+                if transformation['transformation'] == TRANSFORMATION_RESIZE:
+                    width = int(arguments.get('width', 0))
+                    height = int(arguments.get('height', 1.0 * width * aspect))
+                    im = self.resize(im, (width, height))
+                elif transformation['transformation'] == TRANSFORMATION_ZOOM:
+                    decimal_value = float(arguments.get('percent', 100)) / 100
+                    im = im.transform((int(im.size[0] * decimal_value), int(im.size[1] * decimal_value)), Image.EXTENT, (0, 0, im.size[0], im.size[1]))
+                elif transformation['transformation'] == TRANSFORMATION_ROTATE:
+                    # PIL counts degrees counter-clockwise, reverse them
+                    im = im.rotate(360 - arguments.get('degrees', 0))
+    except:
+        # Ignore all transformation error
+        pass
+    '''
+
    # From: http://united-coders.com/christian-harms/image-resizing-tips-general-and-for-python
    def resize(self, img, box, fit=False, out=None):
        """
--- a/mayan/apps/converter/classes.py
+++ b/mayan/apps/converter/classes.py
@@ -19,7 +19,6 @@ from .literals import (
    TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM, DIMENSION_SEPARATOR
 )
 from .office_converter import OfficeConverter
-from .runtime import backend, office_converter
 from .settings import GRAPHICS_BACKEND, LIBREOFFICE_PATH

 CONVERTER_OFFICE_FILE_MIMETYPES = [
@@ -68,7 +67,6 @@ CONVERTER_OFFICE_FILE_MIMETYPES = [
    'text/plain',
    'text/rtf',
 ]
-
 logger = logging.getLogger(__name__)


@@ -109,23 +107,6 @@ class TransformationScale(BaseTransformation):


 class Converter(object):
-    """
-    def cache_cleanup(input_filepath, *args, **kwargs):
-        try:
-            os.remove(create_image_cache_filename(input_filepath, *args, **kwargs))
-        except OSError:
-            pass
-    """
-
-    """
-    def create_image_cache_filename(input_filepath, *args, **kwargs):
-        if input_filepath:
-            hash_value = HASH_FUNCTION(''.join([HASH_FUNCTION(smart_str(input_filepath)), unicode(args), unicode(kwargs)]))
-            return os.path.join(TEMPORARY_DIRECTORY, hash_value)
-        else:
-            return None
-    """
-

    @staticmethod
    def soffice(file_object):
@@ -159,8 +140,7 @@ class Converter(object):
        readline = proc.stderr.readline()
        logger.debug('stderr: %s', readline)
        if return_code != 0:
-            #raise OfficeBackendError(readline)
-            raise Exception(readline)
+            raise OfficeBackendError(readline)

        filename, extension = os.path.splitext(os.path.basename(input_filepath))
        logger.debug('filename: %s', filename)
@@ -169,106 +149,35 @@ class Converter(object):
        converted_output = os.path.join(TEMPORARY_DIRECTORY, os.path.extsep.join([filename, 'pdf']))
        logger.debug('converted_output: %s', converted_output)

-        return open(converted_output)
-        #os.rename(converted_output, output_filepath)
-        # TODO: remove temp file
-
+        return converted_output

    def __init__(self, file_object, mime_type=None):
        self.file_object = file_object
        self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0]
-
-        if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
-            if os.path.exists(LIBREOFFICE_PATH):
-                #file_object, filename = mkstemp()
-
-                # Cache results of conversion
-                #output_filepath = os.path.join(TEMPORARY_DIRECTORY, ''.join([self.input_filepath, CACHED_FILE_SUFFIX]))
-
-                result = Converter.soffice(file_object)
-                file_object.close()
-                self.file_object = result
-                self.mime_type = 'application/pdf'
-
-                #try:
-                #    self.backend.convert(self.input_filepath, self.output_filepath)
-                #    self.exists = True
-                #except OfficeBackendError as exception:
-                #    # convert exception so that at least the mime type icon is displayed
-                #    raise UnknownFileFormat(exception)
-                #else:
-                #    result = office_converter.convert(self.file_object, mimetype=mime_type)
-                #    self.file_object.close()
-                #    self.file_object = result
-            else:
-                # TODO: NO LIBREOFFICE ERROR
-                pass
-
+        self.temporary_files = []

    def transform(self, transformations, page=DEFAULT_PAGE_NUMBER):
        pass

-    def convert(self, output_format=DEFAULT_FILE_FORMAT, page=DEFAULT_PAGE_NUMBER):#, *args, **kwargs):
-        #size = kwargs.get('size')
-        #file_format = kwargs.get('file_format', DEFAULT_FILE_FORMAT)
-        #zoom = kwargs.get('zoom', DEFAULT_ZOOM_LEVEL)
-        #rotation = kwargs.get('rotation', DEFAULT_ROTATION)
-        #page = kwargs.get('page', DEFAULT_PAGE_NUMBER)
-        #transformations = kwargs.get('transformations', [])
+    def convert(self, output_format=DEFAULT_FILE_FORMAT, page=DEFAULT_PAGE_NUMBER):
+        if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
+            if os.path.exists(LIBREOFFICE_PATH):
+                converted_output = Converter.soffice(self.file_object)
+                self.file_object.close()
+                self.file_object = open(converted_output)
+                self.mime_type = 'application/pdf'
+                self.temporary_file.append(converted_output)
+            else:
+                # TODO: NO LIBREOFFICE FOUND ERROR
+                pass

-        #if transformations is None:
-        #    transformations = []
-
-        #if output_filepath is None:
-        #    output_filepath = create_image_cache_filename(input_filepath, *args, **kwargs)
-
-        #if os.path.exists(output_filepath):
-        #    return output_filepath
-
-        '''
-        if office_converter:
-            try:
-                office_converter.convert(input_filepath, mimetype=mimetype)
-                if office_converter.exists:
-                    input_filepath = office_converter.output_filepath
-                    mimetype = 'application/pdf'
-                else:
-                    # Recycle the already detected mimetype
-                    mimetype = office_converter.mimetype
-
-            except OfficeConversionError:
-                raise UnknownFileFormat('office converter exception')
-
-        if size:
-            transformations.append(
-                {
-                    'transformation': TRANSFORMATION_RESIZE,
-                    'arguments': dict(zip(['width', 'height'], size.split(DIMENSION_SEPARATOR)))
-                }
-            )
-
-        if zoom != 100:
-            transformations.append(
-                {
-                    'transformation': TRANSFORMATION_ZOOM,
-                    'arguments': {'percent': zoom}
-                }
-            )
-
-        if rotation != 0 and rotation != 360:
-            transformations.append(
-                {
-                    'transformation': TRANSFORMATION_ROTATE,
-                    'arguments': {'degrees': rotation}
-                }
-            )
-        '''
+        for temporary_file in self.temporary_files:
+            fs_cleanup(temporary_file)

        return backend.convert(file_object=self.file_object, mimetype=self.mime_type, output_format=output_format, page=page)

-        def get_page_count(self):
-            return backend.get_page_count(file_object)
-
+    def get_page_count(self):
+        return backend.get_page_count(file_object)


 '''
--- a/mayan/apps/documents/api_views.py
+++ b/mayan/apps/documents/api_views.py
@@ -36,7 +36,7 @@ from .serializers import (
    RecentDocumentSerializer
 )
 from .settings import DISPLAY_SIZE, ZOOM_MAX_LEVEL, ZOOM_MIN_LEVEL
-from .tasks import task_get_document_image, task_new_document
+from .tasks import task_get_document_page_image, task_new_document


 class APIDocumentListView(generics.ListAPIView):
@@ -202,8 +202,10 @@ class APIDocumentImageView(generics.GenericAPIView):

        rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360

+        document_page = document.pages.get(page_number=page)
+
        try:
-            task = task_get_document_image.apply_async(kwargs=dict(document_id=document.pk, size=size, page=page, zoom=zoom, rotation=rotation, as_base64=True, version=version), queue='converter')
+            task = task_get_document_page_image.apply_async(kwargs=dict(document_page_id=document_page.pk, size=size, zoom=zoom, rotation=rotation, as_base64=True, version=version), queue='converter')
            return Response({
                'status': 'success',
                'data': task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT)
--- a/mayan/apps/documents/models.py
+++ b/mayan/apps/documents/models.py
@@ -17,6 +17,7 @@ from django.utils.translation import ugettext_lazy as _

 from acls.utils import apply_default_acls
 from common.settings import TEMPORARY_DIRECTORY
+from common.utils import fs_cleanup
 from converter.classes import Converter
 from converter.exceptions import UnknownFileFormat
 from converter.literals import (
@@ -526,7 +527,15 @@ class DocumentPage(models.Model):
    def document(self):
        return self.document_version.document

-    def get_image(self, *args, **kargs):
+    def get_uuid(self):
+        return 'page-cache-{}'.format(self.pk)
+
+    def get_cache_filename(self):
+        return os.path.join(CACHE_PATH, self.get_uuid())
+
+    def get_image(self, *args, **kwargs):
+        transformations = kwargs.pop('transformations', [])
+
        #size=DISPLAY_SIZE, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, as_base64=False, version=None):
        #if zoom < ZOOM_MIN_LEVEL:
        #    zoom = ZOOM_MIN_LEVEL
@@ -538,20 +547,34 @@ class DocumentPage(models.Model):

        #file_path = self.get_valid_image(size=size, page=page, zoom=zoom, rotation=rotation, version=version)
        #logger.debug('file_path: %s', file_path)
+        as_base64 = kwargs.pop('as_base64', False)

-        converter = Converter(file_object=self.document_version.open())
-        data = converter.convert(page=self.page_number)
-        #print "data!!!!", data.getvalue()
-        ##, *args, **kwargs):
-        return 'data:%s;base64,%s' % ('PNG', base64.b64encode(data.getvalue()))
+        cache_filename = self.get_cache_filename()

-        #if as_base64:
-        #    with open(file_path, 'r') as file_object:
-        #        #mimetype = get_mimetype(file_object=file_object, mimetype_only=True)[0]
-        #        base64_data = base64.b64encode(file_object.read())
-        #        return 'data:%s;base64,%s' % (mimetype, base64_data)
-        #else:
-        #    return file_path
+        if os.path.exists(cache_filename) and 0:
+            with open(cache_filename) as file_object:
+                data = file_object.read()
+
+            if as_base64:
+                return 'data:%s;base64,%s' % ('image/png', base64.b64encode(data))
+            else:
+                return data
+        else:
+            try:
+                converter = Converter(file_object=self.document_version.open())
+                image_buffer = converter.convert(page=self.page_number, output_format='PNG')
+                with open(cache_filename, 'wb+') as file_object:
+                    file_object.write(image_buffer.getvalue())
+            except:
+                fs_cleanup(cache_filename)
+                raise
+            else:
+                data = image_buffer.getvalue()
+                image_buffer.close()
+                if as_base64:
+                    return 'data:%s;base64,%s' % ('image/png', base64.b64encode(data))
+                else:
+                    return data


 def argument_validator(value):
--- a/mayan/apps/documents/tasks.py
+++ b/mayan/apps/documents/tasks.py
@@ -9,16 +9,15 @@ from mayan.celery import app

 from common.models import SharedUploadedFile

-from .models import Document, DocumentType, DocumentVersion
+from .models import Document, DocumentPage, DocumentType, DocumentVersion

 logger = logging.getLogger(__name__)


@app.task(compression='zlib')
-def task_get_document_image(document_id, *args, **kwargs):
-    document = Document.objects.get(pk=document_id)
-    first_page = document.latest_version.pages.first()
-    return first_page.get_image(*args, **kwargs)
+def task_get_document_page_image(document_page_id, *args, **kwargs):
+    document_page = DocumentPage.objects.get(pk=document_page_id)
+    return document_page.get_image(*args, **kwargs)


@app.task(ignore_result=True)
--- a/mayan/apps/documents/views.py
+++ b/mayan/apps/documents/views.py
@@ -57,7 +57,8 @@ from .settings import (
    ZOOM_MAX_LEVEL, ZOOM_MIN_LEVEL
 )
 from .tasks import (
-    task_clear_image_cache, task_get_document_image, task_update_page_count
+    task_clear_image_cache, task_get_document_page_image,
+    task_update_page_count
 )
 from .utils import parse_range

@@ -366,17 +367,15 @@ def get_document_image(request, document_id, size=PREVIEW_SIZE):

    rotation = int(request.GET.get('rotation', DEFAULT_ROTATION)) % 360

-    task = task_get_document_image.apply_async(kwargs=dict(document_id=document.pk, size=size, page=page, zoom=zoom, rotation=rotation, as_base64=False, version=version), queue='converter')
+    document_page = document.pages.get(page_number=page)
+
+    task = task_get_document_page_image.apply_async(kwargs=dict(document_page_id=document_page.pk, size=size, zoom=zoom, rotation=rotation, as_base64=False, version=version), queue='converter')
    data = task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT)

-    response = HttpResponse(data, content_type='data/PNG')
-    #response['Content-Disposition'] = 'attachment; filename="somefilename.pdf"'
-
+    response = HttpResponse(data, content_type='image')
    return response

-    #print 'data!!!!!!!!!!!', task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT)
-    #re
-
+    # TODO: remove sendfile
    #return sendfile.sendfile(request, task.get(timeout=DOCUMENT_IMAGE_TASK_TIMEOUT), mimetype=DEFAULT_FILE_FORMAT_MIMETYPE)


--- a/mayan/apps/documents/widgets.py
+++ b/mayan/apps/documents/widgets.py
@@ -24,7 +24,7 @@ class DocumentPageImageWidget(forms.widgets.Widget):
        if value:
            output = []
            output.append('<div class="full-height scrollable mayan-page-wrapper-interactive" data-height-difference=230>')
-            output.append(document_html_widget(value.document, page=value.page_number, zoom=zoom, rotation=rotation, image_class='lazy-load-interactive', nolazyload=False, size=DISPLAY_SIZE))
+            output.append(document_html_widget(value, zoom=zoom, rotation=rotation, image_class='lazy-load-interactive', nolazyload=False, size=DISPLAY_SIZE))
            output.append('</div>')
            return mark_safe(''.join(output))
        else:
@@ -46,21 +46,16 @@ class DocumentPagesCarouselWidget(forms.widgets.Widget):
            document_pages = []
            total_pages = 0

-        # Reuse expensive values
-        latest_version_pk = value.latest_version.pk
-
        for page in document_pages:
            output.append('<div class="carousel-item">')
            output.append(
                document_html_widget(
-                    page.document,
+                    page,
                    click_view='documents:document_page_view',
                    click_view_arguments=[page.pk],
-                    page=page.page_number,
                    fancybox_class='',
                    image_class='lazy-load-carousel',
                    size=DISPLAY_SIZE,
-                    version=latest_version_pk,
                    post_load_class='lazy-load-carousel-loaded',
                )
            )
@@ -73,29 +68,25 @@ class DocumentPagesCarouselWidget(forms.widgets.Widget):


 def document_thumbnail(document, **kwargs):
-    return document_html_widget(document, click_view='documents:document_display', **kwargs)
+    return document_html_widget(document.latest_version.pages.first(), click_view='documents:document_display', **kwargs)


 def document_link(document):
    return mark_safe('<a href="%s">%s</a>' % (document.get_absolute_url(), document))


-def document_html_widget(document, click_view=None, click_view_arguments=None, page=DEFAULT_PAGE_NUMBER, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, gallery_name=None, fancybox_class='fancybox', version=None, image_class='lazy-load', title=None, size=THUMBNAIL_SIZE, nolazyload=False, post_load_class=None):
+def document_html_widget(document_page, click_view=None, click_view_arguments=None, zoom=DEFAULT_ZOOM_LEVEL, rotation=DEFAULT_ROTATION, gallery_name=None, fancybox_class='fancybox', image_class='lazy-load', title=None, size=THUMBNAIL_SIZE, nolazyload=False, post_load_class=None):
    result = []

    alt_text = _('Document page image')

-    if not version:
-        try:
-            version = document.latest_version.pk
-        except AttributeError:
-            version = None
+    document = document_page.document
+    page = document_page.page_number

    query_dict = {
        'page': page,
        'zoom': zoom,
        'rotation': rotation,
-        'version': version,
        'size': size,
    }

@@ -116,7 +107,12 @@ def document_html_widget(document, click_view=None, click_view_arguments=None, p
        title_template = ''

    if click_view:
-        result.append('<a %s class="%s" href="%s" %s>' % (gallery_template, fancybox_class, '%s?%s' % (reverse(click_view, args=click_view_arguments or [document.pk]), query_string), title_template))
+        result.append('<a {gallery_template} class="{fancybox_class}" href="{image_data}" {title_template}>'.format(
+            gallery_template=gallery_template,
+            fancybox_class=fancybox_class,
+            image_data='%s?%s' % (reverse(click_view, args=click_view_arguments or [document.pk]), query_string),
+            title_template=title_template
+        ))

    if nolazyload:
        result.append('<img class="img-nolazyload" src="%s" alt="%s" />' % (preview_view, alt_text))