Convert and cache office documents at the document version level for faster page image retrieval

This commit is contained in:
Roberto Rosario
2015-06-24 01:04:35 -04:00
parent 3d68e79654
commit b18888b3f7
7 changed files with 116 additions and 69 deletions

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import subprocess import subprocess
from ..classes import ConverterBase from ..classes import ConverterBase
from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat from ..exceptions import ConvertError, UnknownFileFormat
from ..literals import ( from ..literals import (
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
) )

View File

@@ -3,7 +3,7 @@ from __future__ import unicode_literals
import subprocess import subprocess
from ..classes import ConverterBase from ..classes import ConverterBase
from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat from ..exceptions import ConvertError, UnknownFileFormat
from ..literals import ( from ..literals import (
DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER, DIMENSION_SEPARATOR, DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER, DIMENSION_SEPARATOR,
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM

View File

@@ -39,10 +39,6 @@ class Python(ConverterBase):
new_file_object, input_filepath = tempfile.mkstemp() new_file_object, input_filepath = tempfile.mkstemp()
if self.soffice_file_object:
os.write(new_file_object, self.soffice_file_object.read())
self.soffice_file_object.close()
else:
os.write(new_file_object, self.file_object.read()) os.write(new_file_object, self.file_object.read())
self.file_object.seek(0) self.file_object.seek(0)
@@ -57,6 +53,8 @@ class Python(ConverterBase):
fs_cleanup(input_filepath) fs_cleanup(input_filepath)
def get_page_count(self): def get_page_count(self):
super(Python, self).get_page_count()
page_count = 1 page_count = 1
if self.mime_type == 'application/pdf': if self.mime_type == 'application/pdf':
@@ -64,14 +62,13 @@ class Python(ConverterBase):
try: try:
pages = slate.PDF(self.file_object) pages = slate.PDF(self.file_object)
except Exception as exception: except Exception as exception:
logger.error('slate exception; %s', exception) logger.error('Slate exception; %s', exception)
return 1 raise
# TODO: Maybe return UnknownFileFormat to display proper unknown file format message in document description
else: else:
return len(pages) return len(pages)
finally: finally:
self.file_object.seek(0) self.file_object.seek(0)
else:
try: try:
image = Image.open(self.file_object) image = Image.open(self.file_object)
finally: finally:

View File

@@ -18,11 +18,15 @@ from common.settings import setting_temporary_directory
from common.utils import fs_cleanup from common.utils import fs_cleanup
from mimetype.api import get_mimetype from mimetype.api import get_mimetype
from .exceptions import OfficeConversionError from .exceptions import InvalidOfficeFormat, OfficeConversionError
from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT
from .settings import setting_libreoffice_path from .settings import setting_libreoffice_path
CONVERTER_OFFICE_FILE_MIMETYPES = [ CHUNK_SIZE = 1024
logger = logging.getLogger(__name__)
CONVERTER_OFFICE_FILE_MIMETYPES = (
'application/msword', 'application/msword',
'application/mswrite', 'application/mswrite',
'application/mspowerpoint', 'application/mspowerpoint',
@@ -67,9 +71,7 @@ CONVERTER_OFFICE_FILE_MIMETYPES = [
'text/x-shellscript', 'text/x-shellscript',
'text/plain', 'text/plain',
'text/rtf', 'text/rtf',
] )
logger = logging.getLogger(__name__)
class ConverterBase(object): class ConverterBase(object):
@staticmethod @staticmethod
@@ -78,11 +80,15 @@ class ConverterBase(object):
Executes libreoffice using subprocess's Popen Executes libreoffice using subprocess's Popen
""" """
if not os.path.exists(setting_libreoffice_path.value):
raise OfficeConversionError(_('LibreOffice not installed or not found at path: %s') % setting_libreoffice_path.value)
new_file_object, input_filepath = tempfile.mkstemp() new_file_object, input_filepath = tempfile.mkstemp()
new_file_object.write(file_object.read())
file_object.seek(0) file_object.seek(0)
new_file_object.seek(0) os.write(new_file_object, file_object.read())
new_file_object.close() file_object.seek(0)
os.lseek(new_file_object, 0, os.SEEK_SET)
os.close(new_file_object)
command = [] command = []
command.append(setting_libreoffice_path.value) command.append(setting_libreoffice_path.value)
@@ -100,9 +106,11 @@ class ConverterBase(object):
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE) proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
return_code = proc.wait() return_code = proc.wait()
logger.debug('return_code: %s', return_code) logger.debug('return_code: %s', return_code)
fs_cleanup(input_filepath)
readline = proc.stderr.readline() readline = proc.stderr.readline()
logger.debug('stderr: %s', readline) logger.debug('stderr: %s', readline)
if return_code != 0: if return_code != 0:
raise OfficeConversionError(readline) raise OfficeConversionError(readline)
@@ -113,7 +121,14 @@ class ConverterBase(object):
converted_output = os.path.join(setting_temporary_directory.value, os.path.extsep.join([filename, 'pdf'])) converted_output = os.path.join(setting_temporary_directory.value, os.path.extsep.join([filename, 'pdf']))
logger.debug('converted_output: %s', converted_output) logger.debug('converted_output: %s', converted_output)
return converted_output with open(converted_output) as converted_file_object:
while True:
data = converted_file_object.read(CHUNK_SIZE)
if not data:
break
yield data
fs_cleanup(input_filepath)
def __init__(self, file_object, mime_type=None): def __init__(self, file_object, mime_type=None):
self.file_object = file_object self.file_object = file_object
@@ -121,6 +136,12 @@ class ConverterBase(object):
self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0] self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0]
self.soffice_file_object = None self.soffice_file_object = None
def to_pdf(self):
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
return ConverterBase.soffice(self.file_object)
else:
raise InvalidOfficeFormat(_('Not an office file format.'))
def seek(self, page_number): def seek(self, page_number):
# Starting with #0 # Starting with #0
self.file_object.seek(0) self.file_object.seek(0)
@@ -147,22 +168,6 @@ class ConverterBase(object):
def convert(self, page_number=DEFAULT_PAGE_NUMBER): def convert(self, page_number=DEFAULT_PAGE_NUMBER):
self.page_number = page_number self.page_number = page_number
self.mime_type = 'application/pdf'
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
if os.path.exists(setting_libreoffice_path.value):
if not self.soffice_file_object:
converted_output = ConverterBase.soffice(self.file_object)
self.file_object.seek(0)
self.soffice_file_object = open(converted_output)
self.mime_type = 'application/pdf'
fs_cleanup(converted_output)
else:
self.soffice_file_object.seek(0)
else:
# TODO: NO LIBREOFFICE FOUND ERROR
pass
def transform(self, transformation): def transform(self, transformation):
if not self.image: if not self.image:
self.seek(0) self.seek(0)
@@ -177,7 +182,7 @@ class ConverterBase(object):
self.image = transformation.execute_on(self.image) self.image = transformation.execute_on(self.image)
def get_page_count(self): def get_page_count(self):
raise NotImplementedError() raise NotImplementedError
class BaseTransformation(object): class BaseTransformation(object):

View File

@@ -15,13 +15,6 @@ class UnknownFileFormat(ConvertError):
pass pass
class IdentifyError(ConvertError):
"""
Raised by the GraphicsMagick and ImageMagick identify programs
"""
pass
class UnkownConvertError(ConvertError): class UnkownConvertError(ConvertError):
""" """
Raised when an error is found but there is no discernible way to Raised when an error is found but there is no discernible way to
@@ -34,5 +27,5 @@ class OfficeConversionError(ConvertError):
pass pass
class OfficeBackendError(OfficeConversionError): class InvalidOfficeFormat(ConvertError):
pass pass

View File

@@ -16,9 +16,10 @@ from acls.utils import apply_default_acls
from common.settings import setting_temporary_directory from common.settings import setting_temporary_directory
from common.utils import fs_cleanup from common.utils import fs_cleanup
from converter import ( from converter import (
converter_class, TransformationResize, TransformationRotate, TransformationZoom converter_class, TransformationResize, TransformationRotate,
TransformationZoom
) )
from converter.exceptions import UnknownFileFormat from converter.exceptions import InvalidOfficeFormat, UnknownFileFormat
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION
from converter.models import Transformation from converter.models import Transformation
from mimetype.api import get_mimetype from mimetype.api import get_mimetype
@@ -395,6 +396,45 @@ class DocumentVersion(models.Model):
def page_count(self): def page_count(self):
return self.pages.count() return self.pages.count()
@property
def uuid(self):
# Make cache UUID a mix of document UUID, version ID
return '{}-{}'.format(self.document.uuid, self.pk)
@property
def cache_filename(self):
return os.path.join(setting_cache_path.value, 'document-version-{}'.format(self.uuid))
def get_intermidiate_file(self):
cache_filename = self.cache_filename
logger.debug('Intermidiate filename: %s', cache_filename)
if os.path.exists(cache_filename):
logger.debug('Intermidiate file "%s" found.', cache_filename)
return open(cache_filename)
#converter = converter_class(file_object=open(cache_filename))
#converter.seek(0)
else:
logger.debug('Intermidiate file "%s" not found.', cache_filename)
try:
converter = converter_class(file_object=self.open())
pdf_file_object = converter.to_pdf()
with open(cache_filename, 'wb+') as file_object:
for chunk in pdf_file_object:
file_object.write(chunk)
return open(cache_filename)
except InvalidOfficeFormat:
return self.open()
except Exception as exception:
# Cleanup in case of error
logger.error('Error creating intermediate file "%s"; %s.', cache_filename, exception)
fs_cleanup(cache_filename)
raise
@python_2_unicode_compatible @python_2_unicode_compatible
class DocumentTypeFilename(models.Model): class DocumentTypeFilename(models.Model):
@@ -455,13 +495,17 @@ class DocumentPage(models.Model):
def invalidate_cache(self): def invalidate_cache(self):
fs_cleanup(self.get_cache_filename()) fs_cleanup(self.get_cache_filename())
def get_uuid(self): @property
# Make cache UUID a mix of document UUID, version ID and page ID to def uuid(self):
# avoid using stale images """
return 'page-cache-{}-{}-{}'.format(self.document.uuid, self.document_version.pk, self.pk) Make cache UUID a mix of version ID and page ID to avoid using stale
images
"""
return '{}-{}'.format(self.document_version.uuid, self.pk)
def get_cache_filename(self): @property
return os.path.join(setting_cache_path.value, self.get_uuid()) def cache_filename(self):
return os.path.join(setting_cache_path.value, 'page-cache-{}'.format(self.uuid))
def get_image(self, *args, **kwargs): def get_image(self, *args, **kwargs):
as_base64 = kwargs.pop('as_base64', False) as_base64 = kwargs.pop('as_base64', False)
@@ -478,22 +522,27 @@ class DocumentPage(models.Model):
rotation = rotation % 360 rotation = rotation % 360
cache_filename = self.get_cache_filename() cache_filename = self.cache_filename
logger.debug('Page cache filename: %s', cache_filename)
if os.path.exists(cache_filename): if os.path.exists(cache_filename):
logger.debug('Page cache file "%s" found', cache_filename)
converter = converter_class(file_object=open(cache_filename)) converter = converter_class(file_object=open(cache_filename))
converter.seek(0) converter.seek(0)
else: else:
logger.debug('Page cache file "%s" not found', cache_filename)
try: try:
converter = converter_class(file_object=self.document_version.open()) converter = converter_class(file_object=self.document_version.get_intermidiate_file())
converter.seek(page_number=self.page_number - 1) converter.seek(page_number=self.page_number - 1)
page_image = converter.get_page() page_image = converter.get_page()
with open(cache_filename, 'wb+') as file_object: with open(cache_filename, 'wb+') as file_object:
file_object.write(page_image.getvalue()) file_object.write(page_image.getvalue())
except: except Exception as exception:
# Cleanup in case of error # Cleanup in case of error
logger.error('Error creating page cache file "%s".', cache_filename)
fs_cleanup(cache_filename) fs_cleanup(cache_filename)
raise raise

View File

@@ -263,6 +263,9 @@ class SourceColumn(object):
return cls._registry[source.__class__] return cls._registry[source.__class__]
except KeyError: except KeyError:
return () return ()
except TypeError:
# unhashable type: list
return ()
def __init__(self, source, label, attribute): def __init__(self, source, label, attribute):
self.__class__._registry.setdefault(source, []) self.__class__._registry.setdefault(source, [])