From b18888b3f746e67eba0ecd5c01f18f5beaff2e73 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Wed, 24 Jun 2015 01:04:35 -0400 Subject: [PATCH] Convert and cache office documents at the document version level for faster page image retrieval --- .../apps/converter/backends/graphicsmagick.py | 2 +- mayan/apps/converter/backends/imagemagick.py | 2 +- mayan/apps/converter/backends/python.py | 41 +++++------ mayan/apps/converter/classes.py | 57 ++++++++------- mayan/apps/converter/exceptions.py | 9 +-- mayan/apps/documents/models.py | 71 ++++++++++++++++--- mayan/apps/navigation/classes.py | 3 + 7 files changed, 116 insertions(+), 69 deletions(-) diff --git a/mayan/apps/converter/backends/graphicsmagick.py b/mayan/apps/converter/backends/graphicsmagick.py index b308ec16bb..374c8c121f 100644 --- a/mayan/apps/converter/backends/graphicsmagick.py +++ b/mayan/apps/converter/backends/graphicsmagick.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import subprocess from ..classes import ConverterBase -from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat +from ..exceptions import ConvertError, UnknownFileFormat from ..literals import ( TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM ) diff --git a/mayan/apps/converter/backends/imagemagick.py b/mayan/apps/converter/backends/imagemagick.py index d5b25f8131..cb4b772650 100644 --- a/mayan/apps/converter/backends/imagemagick.py +++ b/mayan/apps/converter/backends/imagemagick.py @@ -3,7 +3,7 @@ from __future__ import unicode_literals import subprocess from ..classes import ConverterBase -from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat +from ..exceptions import ConvertError, UnknownFileFormat from ..literals import ( DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER, DIMENSION_SEPARATOR, TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM diff --git a/mayan/apps/converter/backends/python.py b/mayan/apps/converter/backends/python.py index fe98db0725..307d31cb19 100644 --- a/mayan/apps/converter/backends/python.py +++ b/mayan/apps/converter/backends/python.py @@ -39,12 +39,8 @@ class Python(ConverterBase): new_file_object, input_filepath = tempfile.mkstemp() - if self.soffice_file_object: - os.write(new_file_object, self.soffice_file_object.read()) - self.soffice_file_object.close() - else: - os.write(new_file_object, self.file_object.read()) - self.file_object.seek(0) + os.write(new_file_object, self.file_object.read()) + self.file_object.seek(0) os.close(new_file_object) @@ -57,6 +53,8 @@ class Python(ConverterBase): fs_cleanup(input_filepath) def get_page_count(self): + super(Python, self).get_page_count() + page_count = 1 if self.mime_type == 'application/pdf': @@ -64,25 +62,24 @@ class Python(ConverterBase): try: pages = slate.PDF(self.file_object) except Exception as exception: - logger.error('slate exception; %s', exception) - return 1 - # TODO: Maybe return UnknownFileFormat to display proper unknwon file format message in document description + logger.error('Slate exception; %s', exception) + raise else: return len(pages) finally: self.file_object.seek(0) + else: + try: + image = Image.open(self.file_object) + finally: + self.file_object.seek(0) - try: - image = Image.open(self.file_object) - finally: - self.file_object.seek(0) + try: + while True: + image.seek(image.tell() + 1) + page_count += 1 + except EOFError: + # end of sequence + pass - try: - while True: - image.seek(image.tell() + 1) - page_count += 1 - except EOFError: - # end of sequence - pass - - return page_count + return page_count diff --git a/mayan/apps/converter/classes.py b/mayan/apps/converter/classes.py index 67670798fa..f26a9f2d3b 100644 --- a/mayan/apps/converter/classes.py +++ b/mayan/apps/converter/classes.py @@ -18,11 +18,15 @@ from common.settings import setting_temporary_directory from common.utils import fs_cleanup from mimetype.api import get_mimetype -from .exceptions import OfficeConversionError +from .exceptions import InvalidOfficeFormat, OfficeConversionError from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT from .settings import setting_libreoffice_path -CONVERTER_OFFICE_FILE_MIMETYPES = [ +CHUNK_SIZE = 1024 +logger = logging.getLogger(__name__) + + +CONVERTER_OFFICE_FILE_MIMETYPES = ( 'application/msword', 'application/mswrite', 'application/mspowerpoint', @@ -67,9 +71,7 @@ CONVERTER_OFFICE_FILE_MIMETYPES = [ 'text/x-shellscript', 'text/plain', 'text/rtf', -] -logger = logging.getLogger(__name__) - +) class ConverterBase(object): @staticmethod @@ -78,11 +80,15 @@ class ConverterBase(object): Executes libreoffice using subprocess's Popen """ + if not os.path.exists(setting_libreoffice_path.value): + raise OfficeConversionError(_('LibreOffice not installed or not found at path: %s') % setting_libreoffice_path.value) + new_file_object, input_filepath = tempfile.mkstemp() - new_file_object.write(file_object.read()) file_object.seek(0) - new_file_object.seek(0) - new_file_object.close() + os.write(new_file_object, file_object.read()) + file_object.seek(0) + os.lseek(new_file_object, 0, os.SEEK_SET) + os.close(new_file_object) command = [] command.append(setting_libreoffice_path.value) @@ -100,9 +106,11 @@ class ConverterBase(object): proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) return_code = proc.wait() logger.debug('return_code: %s', return_code) + fs_cleanup(input_filepath) readline = proc.stderr.readline() logger.debug('stderr: %s', readline) + if return_code != 0: raise OfficeConversionError(readline) @@ -113,7 +121,14 @@ class ConverterBase(object): converted_output = os.path.join(setting_temporary_directory.value, os.path.extsep.join([filename, 'pdf'])) logger.debug('converted_output: %s', converted_output) - return converted_output + with open(converted_output) as converted_file_object: + while True: + data = converted_file_object.read(CHUNK_SIZE) + if not data: + break + yield data + + fs_cleanup(input_filepath) def __init__(self, file_object, mime_type=None): self.file_object = file_object @@ -121,6 +136,12 @@ class ConverterBase(object): self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0] self.soffice_file_object = None + def to_pdf(self): + if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES: + return ConverterBase.soffice(self.file_object) + else: + raise InvalidOfficeFormat(_('Not an office file format.')) + def seek(self, page_number): # Starting with #0 self.file_object.seek(0) @@ -147,22 +168,6 @@ class ConverterBase(object): def convert(self, page_number=DEFAULT_PAGE_NUMBER): self.page_number = page_number - self.mime_type = 'application/pdf' - - if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES: - if os.path.exists(setting_libreoffice_path.value): - if not self.soffice_file_object: - converted_output = ConverterBase.soffice(self.file_object) - self.file_object.seek(0) - self.soffice_file_object = open(converted_output) - self.mime_type = 'application/pdf' - fs_cleanup(converted_output) - else: - self.soffice_file_object.seek(0) - else: - # TODO: NO LIBREOFFICE FOUND ERROR - pass - def transform(self, transformation): if not self.image: self.seek(0) @@ -177,7 +182,7 @@ class ConverterBase(object): self.image = transformation.execute_on(self.image) def get_page_count(self): - raise NotImplementedError() + raise NotImplementedError class BaseTransformation(object): diff --git a/mayan/apps/converter/exceptions.py b/mayan/apps/converter/exceptions.py index 832fe5e1be..617643b40e 100644 --- a/mayan/apps/converter/exceptions.py +++ b/mayan/apps/converter/exceptions.py @@ -15,13 +15,6 @@ class UnknownFileFormat(ConvertError): pass -class IdentifyError(ConvertError): - """ - Raised by the graphcismagick and imagemagics identify program - """ - pass - - class UnkownConvertError(ConvertError): """ Raised when an error is found but there is no disernible way to @@ -34,5 +27,5 @@ class OfficeConversionError(ConvertError): pass -class OfficeBackendError(OfficeConversionError): +class InvalidOfficeFormat(ConvertError): pass diff --git a/mayan/apps/documents/models.py b/mayan/apps/documents/models.py index bca0fb45bc..f3837b3c17 100644 --- a/mayan/apps/documents/models.py +++ b/mayan/apps/documents/models.py @@ -16,9 +16,10 @@ from acls.utils import apply_default_acls from common.settings import setting_temporary_directory from common.utils import fs_cleanup from converter import ( - converter_class, TransformationResize, TransformationRotate, TransformationZoom + converter_class, TransformationResize, TransformationRotate, + TransformationZoom ) -from converter.exceptions import UnknownFileFormat +from converter.exceptions import InvalidOfficeFormat, UnknownFileFormat from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION from converter.models import Transformation from mimetype.api import get_mimetype @@ -395,6 +396,45 @@ class DocumentVersion(models.Model): def page_count(self): return self.pages.count() + @property + def uuid(self): + # Make cache UUID a mix of document UUID, version ID + return '{}-{}'.format(self.document.uuid, self.pk) + + @property + def cache_filename(self): + return os.path.join(setting_cache_path.value, 'document-version-{}'.format(self.uuid)) + + def get_intermidiate_file(self): + cache_filename = self.cache_filename + logger.debug('Intermidiate filename: %s', cache_filename) + + if os.path.exists(cache_filename): + logger.debug('Intermidiate file "%s" found.', cache_filename) + + return open(cache_filename) + #converter = converter_class(file_object=open(cache_filename)) + #converter.seek(0) + else: + logger.debug('Intermidiate file "%s" not found.', cache_filename) + + try: + converter = converter_class(file_object=self.open()) + pdf_file_object = converter.to_pdf() + + with open(cache_filename, 'wb+') as file_object: + for chunk in pdf_file_object: + file_object.write(chunk) + + return open(cache_filename) + except InvalidOfficeFormat: + return self.open() + except Exception as exception: + # Cleanup in case of error + logger.error('Error creating intermediate file "%s"; %s.', cache_filename, exception) + fs_cleanup(cache_filename) + raise + @python_2_unicode_compatible class DocumentTypeFilename(models.Model): @@ -455,13 +495,17 @@ class DocumentPage(models.Model): def invalidate_cache(self): fs_cleanup(self.get_cache_filename()) - def get_uuid(self): - # Make cache UUID a mix of document UUID, version ID and page ID to - # avoid using stale images - return 'page-cache-{}-{}-{}'.format(self.document.uuid, self.document_version.pk, self.pk) + @property + def uuid(self): + """ + Make cache UUID a mix of version ID and page ID to avoid using stale + images + """ + return '{}-{}'.format(self.document_version.uuid, self.pk) - def get_cache_filename(self): - return os.path.join(setting_cache_path.value, self.get_uuid()) + @property + def cache_filename(self): + return os.path.join(setting_cache_path.value, 'page-cache-{}'.format(self.uuid)) def get_image(self, *args, **kwargs): as_base64 = kwargs.pop('as_base64', False) @@ -478,22 +522,27 @@ class DocumentPage(models.Model): rotation = rotation % 360 - cache_filename = self.get_cache_filename() + cache_filename = self.cache_filename + logger.debug('Page cache filename: %s', cache_filename) if os.path.exists(cache_filename): + logger.debug('Page cache file "%s" found', cache_filename) converter = converter_class(file_object=open(cache_filename)) converter.seek(0) else: + logger.debug('Page cache file "%s" not found', cache_filename) + try: - converter = converter_class(file_object=self.document_version.open()) + converter = converter_class(file_object=self.document_version.get_intermidiate_file()) converter.seek(page_number=self.page_number - 1) page_image = converter.get_page() with open(cache_filename, 'wb+') as file_object: file_object.write(page_image.getvalue()) - except: + except Exception as exception: # Cleanup in case of error + logger.error('Error creating page cache file "%s".', cache_filename) fs_cleanup(cache_filename) raise diff --git a/mayan/apps/navigation/classes.py b/mayan/apps/navigation/classes.py index 0b3b80a94a..df62d56a03 100644 --- a/mayan/apps/navigation/classes.py +++ b/mayan/apps/navigation/classes.py @@ -263,6 +263,9 @@ class SourceColumn(object): return cls._registry[source.__class__] except KeyError: return () + except TypeError: + # unhashable type: list + return () def __init__(self, source, label, attribute): self.__class__._registry.setdefault(source, [])