From b18888b3f746e67eba0ecd5c01f18f5beaff2e73 Mon Sep 17 00:00:00 2001
From: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
Date: Wed, 24 Jun 2015 01:04:35 -0400
Subject: [PATCH] Convert and cache office documents at the document version
 level for faster page image retrieval

---
 .../apps/converter/backends/graphicsmagick.py |  2 +-
 mayan/apps/converter/backends/imagemagick.py  |  2 +-
 mayan/apps/converter/backends/python.py       | 41 +++++------
 mayan/apps/converter/classes.py               | 57 ++++++++-------
 mayan/apps/converter/exceptions.py            |  9 +--
 mayan/apps/documents/models.py                | 71 ++++++++++++++++---
 mayan/apps/navigation/classes.py              |  3 +
 7 files changed, 116 insertions(+), 69 deletions(-)

diff --git a/mayan/apps/converter/backends/graphicsmagick.py b/mayan/apps/converter/backends/graphicsmagick.py
index b308ec16bb..374c8c121f 100644
--- a/mayan/apps/converter/backends/graphicsmagick.py
+++ b/mayan/apps/converter/backends/graphicsmagick.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import subprocess
 
 from ..classes import ConverterBase
-from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat
+from ..exceptions import ConvertError, UnknownFileFormat
 from ..literals import (
     TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
 )
diff --git a/mayan/apps/converter/backends/imagemagick.py b/mayan/apps/converter/backends/imagemagick.py
index d5b25f8131..cb4b772650 100644
--- a/mayan/apps/converter/backends/imagemagick.py
+++ b/mayan/apps/converter/backends/imagemagick.py
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
 import subprocess
 
 from ..classes import ConverterBase
-from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat
+from ..exceptions import ConvertError, UnknownFileFormat
 from ..literals import (
     DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER, DIMENSION_SEPARATOR,
     TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
diff --git a/mayan/apps/converter/backends/python.py b/mayan/apps/converter/backends/python.py
index fe98db0725..307d31cb19 100644
--- a/mayan/apps/converter/backends/python.py
+++ b/mayan/apps/converter/backends/python.py
@@ -39,12 +39,8 @@ class Python(ConverterBase):
 
             new_file_object, input_filepath = tempfile.mkstemp()
 
-            if self.soffice_file_object:
-                os.write(new_file_object, self.soffice_file_object.read())
-                self.soffice_file_object.close()
-            else:
-                os.write(new_file_object, self.file_object.read())
-                self.file_object.seek(0)
+            os.write(new_file_object, self.file_object.read())
+            self.file_object.seek(0)
 
             os.close(new_file_object)
 
@@ -57,6 +53,8 @@ class Python(ConverterBase):
                 fs_cleanup(input_filepath)
 
     def get_page_count(self):
+        """Return the number of pages: PDF pages, or frames for images."""
+
         page_count = 1
 
         if self.mime_type == 'application/pdf':
@@ -64,25 +62,24 @@ class Python(ConverterBase):
             try:
                 pages = slate.PDF(self.file_object)
             except Exception as exception:
-                logger.error('slate exception; %s', exception)
-                return 1
-                # TODO: Maybe return UnknownFileFormat to display proper unknwon file format message in document description
+                logger.error('Slate exception; %s', exception)
+                raise
             else:
                 return len(pages)
             finally:
                 self.file_object.seek(0)
+        else:
+            try:
+                image = Image.open(self.file_object)
+            finally:
+                self.file_object.seek(0)
 
-        try:
-            image = Image.open(self.file_object)
-        finally:
-            self.file_object.seek(0)
+            try:
+                while True:
+                    image.seek(image.tell() + 1)
+                    page_count += 1
+            except EOFError:
+                # end of sequence
+                pass
 
-        try:
-            while True:
-                image.seek(image.tell() + 1)
-                page_count += 1
-        except EOFError:
-            # end of sequence
-            pass
-
-        return page_count
+            return page_count
diff --git a/mayan/apps/converter/classes.py b/mayan/apps/converter/classes.py
index 67670798fa..f26a9f2d3b 100644
--- a/mayan/apps/converter/classes.py
+++ b/mayan/apps/converter/classes.py
@@ -18,11 +18,15 @@ from common.settings import setting_temporary_directory
 from common.utils import fs_cleanup
 from mimetype.api import get_mimetype
 
-from .exceptions import OfficeConversionError
+from .exceptions import InvalidOfficeFormat, OfficeConversionError
 from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT
 from .settings import setting_libreoffice_path
 
-CONVERTER_OFFICE_FILE_MIMETYPES = [
+CHUNK_SIZE = 1024
+logger = logging.getLogger(__name__)
+
+
+CONVERTER_OFFICE_FILE_MIMETYPES = (
     'application/msword',
     'application/mswrite',
     'application/mspowerpoint',
@@ -67,9 +71,7 @@ CONVERTER_OFFICE_FILE_MIMETYPES = [
     'text/x-shellscript',
     'text/plain',
     'text/rtf',
-]
-logger = logging.getLogger(__name__)
-
+)
 
 class ConverterBase(object):
     @staticmethod
@@ -78,11 +80,15 @@ class ConverterBase(object):
         Executes libreoffice using subprocess's Popen
         """
 
+        if not os.path.exists(setting_libreoffice_path.value):
+            raise OfficeConversionError(_('LibreOffice not installed or not found at path: %s') % setting_libreoffice_path.value)
+
         new_file_object, input_filepath = tempfile.mkstemp()
-        new_file_object.write(file_object.read())
         file_object.seek(0)
-        new_file_object.seek(0)
-        new_file_object.close()
+        os.write(new_file_object, file_object.read())
+        file_object.seek(0)
+        os.lseek(new_file_object, 0, os.SEEK_SET)
+        os.close(new_file_object)
 
         command = []
         command.append(setting_libreoffice_path.value)
@@ -100,9 +106,11 @@ class ConverterBase(object):
         proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
         return_code = proc.wait()
         logger.debug('return_code: %s', return_code)
+        fs_cleanup(input_filepath)
 
         readline = proc.stderr.readline()
         logger.debug('stderr: %s', readline)
+
         if return_code != 0:
             raise OfficeConversionError(readline)
 
@@ -113,7 +121,14 @@ class ConverterBase(object):
         converted_output = os.path.join(setting_temporary_directory.value, os.path.extsep.join([filename, 'pdf']))
         logger.debug('converted_output: %s', converted_output)
 
-        return converted_output
+        try:
+            # Binary mode: the converted file is a PDF, not text
+            with open(converted_output, 'rb') as converted_file_object:
+                for data in iter(lambda: converted_file_object.read(CHUNK_SIZE), b''):
+                    yield data
+        finally:
+            # Remove the converted PDF, also when the consumer stops early
+            fs_cleanup(converted_output)
 
     def __init__(self, file_object, mime_type=None):
         self.file_object = file_object
@@ -121,6 +136,12 @@ class ConverterBase(object):
         self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0]
         self.soffice_file_object = None
 
+    def to_pdf(self):
+        """Return the LibreOffice PDF conversion of the file; office formats only."""
+        if self.mime_type not in CONVERTER_OFFICE_FILE_MIMETYPES:
+            raise InvalidOfficeFormat(_('Not an office file format.'))
+        return ConverterBase.soffice(self.file_object)
+
     def seek(self, page_number):
         # Starting with #0
         self.file_object.seek(0)
@@ -147,22 +168,6 @@ class ConverterBase(object):
     def convert(self, page_number=DEFAULT_PAGE_NUMBER):
         self.page_number = page_number
 
-        self.mime_type = 'application/pdf'
-
-        if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
-            if os.path.exists(setting_libreoffice_path.value):
-                if not self.soffice_file_object:
-                    converted_output = ConverterBase.soffice(self.file_object)
-                    self.file_object.seek(0)
-                    self.soffice_file_object = open(converted_output)
-                    self.mime_type = 'application/pdf'
-                    fs_cleanup(converted_output)
-                else:
-                    self.soffice_file_object.seek(0)
-            else:
-                # TODO: NO LIBREOFFICE FOUND ERROR
-                pass
-
     def transform(self, transformation):
         if not self.image:
             self.seek(0)
@@ -177,7 +182,7 @@ class ConverterBase(object):
             self.image = transformation.execute_on(self.image)
 
     def get_page_count(self):
-        raise NotImplementedError()
+        raise NotImplementedError
 
 
 class BaseTransformation(object):
diff --git a/mayan/apps/converter/exceptions.py b/mayan/apps/converter/exceptions.py
index 832fe5e1be..617643b40e 100644
--- a/mayan/apps/converter/exceptions.py
+++ b/mayan/apps/converter/exceptions.py
@@ -15,13 +15,6 @@ class UnknownFileFormat(ConvertError):
     pass
 
 
-class IdentifyError(ConvertError):
-    """
-    Raised by the graphcismagick and imagemagics identify program
-    """
-    pass
-
-
 class UnkownConvertError(ConvertError):
     """
     Raised when an error is found but there is no disernible way to
@@ -34,5 +27,5 @@ class OfficeConversionError(ConvertError):
     pass
 
 
-class OfficeBackendError(OfficeConversionError):
+class InvalidOfficeFormat(ConvertError):
     pass
diff --git a/mayan/apps/documents/models.py b/mayan/apps/documents/models.py
index bca0fb45bc..f3837b3c17 100644
--- a/mayan/apps/documents/models.py
+++ b/mayan/apps/documents/models.py
@@ -16,9 +16,10 @@ from acls.utils import apply_default_acls
 from common.settings import setting_temporary_directory
 from common.utils import fs_cleanup
 from converter import (
-    converter_class, TransformationResize, TransformationRotate, TransformationZoom
+    converter_class, TransformationResize, TransformationRotate,
+    TransformationZoom
 )
-from converter.exceptions import UnknownFileFormat
+from converter.exceptions import InvalidOfficeFormat, UnknownFileFormat
 from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION
 from converter.models import Transformation
 from mimetype.api import get_mimetype
@@ -395,6 +396,45 @@ class DocumentVersion(models.Model):
     def page_count(self):
         return self.pages.count()
 
+    @property
+    def uuid(self):
+        # Build the cache UUID from the document UUID and the version ID
+        return '{}-{}'.format(self.document.uuid, self.pk)
+
+    @property
+    def cache_filename(self):
+        return os.path.join(setting_cache_path.value, 'document-version-{}'.format(self.uuid))
+
+    def get_intermidiate_file(self):
+        """Return an open file with the version content; office files are served as a cached PDF."""
+        cache_filename = self.cache_filename
+        logger.debug('Intermidiate filename: %s', cache_filename)
+
+        if os.path.exists(cache_filename):
+            logger.debug('Intermidiate file "%s" found.', cache_filename)
+            # The cache holds a PDF: open it as binary data
+            return open(cache_filename, 'rb')
+
+        logger.debug('Intermidiate file "%s" not found.', cache_filename)
+
+        try:
+            converter = converter_class(file_object=self.open())
+            pdf_file_object = converter.to_pdf()
+
+            with open(cache_filename, 'wb+') as file_object:
+                for chunk in pdf_file_object:
+                    file_object.write(chunk)
+
+            return open(cache_filename, 'rb')
+        except InvalidOfficeFormat:
+            # Not an office document: fall back to the stored file
+            return self.open()
+        except Exception as exception:
+            # Remove the partial cache file so a later call does not serve it
+            logger.error('Error creating intermediate file "%s"; %s.', cache_filename, exception)
+            fs_cleanup(cache_filename)
+            raise
+
 
 @python_2_unicode_compatible
 class DocumentTypeFilename(models.Model):
@@ -455,13 +495,17 @@ class DocumentPage(models.Model):
     def invalidate_cache(self):
-        fs_cleanup(self.get_cache_filename())
+        fs_cleanup(self.cache_filename)
 
-    def get_uuid(self):
-        # Make cache UUID a mix of document UUID, version ID and page ID to
-        # avoid using stale images
-        return 'page-cache-{}-{}-{}'.format(self.document.uuid, self.document_version.pk, self.pk)
+    @property
+    def uuid(self):
+        """
+        Make cache UUID a mix of version ID and page ID to avoid using stale
+        images
+        """
+        return '{}-{}'.format(self.document_version.uuid, self.pk)
 
-    def get_cache_filename(self):
-        return os.path.join(setting_cache_path.value, self.get_uuid())
+    @property
+    def cache_filename(self):
+        return os.path.join(setting_cache_path.value, 'page-cache-{}'.format(self.uuid))
 
     def get_image(self, *args, **kwargs):
         as_base64 = kwargs.pop('as_base64', False)
@@ -478,22 +522,27 @@ class DocumentPage(models.Model):
 
         rotation = rotation % 360
 
-        cache_filename = self.get_cache_filename()
+        cache_filename = self.cache_filename
+        logger.debug('Page cache filename: %s', cache_filename)
 
         if os.path.exists(cache_filename):
+            logger.debug('Page cache file "%s" found', cache_filename)
             converter = converter_class(file_object=open(cache_filename))
 
             converter.seek(0)
         else:
+            logger.debug('Page cache file "%s" not found', cache_filename)
+
             try:
-                converter = converter_class(file_object=self.document_version.open())
+                converter = converter_class(file_object=self.document_version.get_intermidiate_file())
                 converter.seek(page_number=self.page_number - 1)
 
                 page_image = converter.get_page()
                 with open(cache_filename, 'wb+') as file_object:
                     file_object.write(page_image.getvalue())
-            except:
+            except Exception as exception:
                 # Cleanup in case of error
+                logger.error('Error creating page cache file "%s".', cache_filename)
                 fs_cleanup(cache_filename)
                 raise
 
diff --git a/mayan/apps/navigation/classes.py b/mayan/apps/navigation/classes.py
index 0b3b80a94a..df62d56a03 100644
--- a/mayan/apps/navigation/classes.py
+++ b/mayan/apps/navigation/classes.py
@@ -263,6 +263,9 @@ class SourceColumn(object):
                     return cls._registry[source.__class__]
                 except KeyError:
                     return ()
+        except TypeError:
+            # Raised when source is unhashable, e.g. a list
+            return ()
 
     def __init__(self, source, label, attribute):
         self.__class__._registry.setdefault(source, [])