Convert and cache office documents at the document version level for faster page image retrieval
This commit is contained in:
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from ..classes import ConverterBase
|
from ..classes import ConverterBase
|
||||||
from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat
|
from ..exceptions import ConvertError, UnknownFileFormat
|
||||||
from ..literals import (
|
from ..literals import (
|
||||||
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ from __future__ import unicode_literals
|
|||||||
import subprocess
|
import subprocess
|
||||||
|
|
||||||
from ..classes import ConverterBase
|
from ..classes import ConverterBase
|
||||||
from ..exceptions import ConvertError, IdentifyError, UnknownFileFormat
|
from ..exceptions import ConvertError, UnknownFileFormat
|
||||||
from ..literals import (
|
from ..literals import (
|
||||||
DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER, DIMENSION_SEPARATOR,
|
DEFAULT_FILE_FORMAT, DEFAULT_PAGE_NUMBER, DIMENSION_SEPARATOR,
|
||||||
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
TRANSFORMATION_RESIZE, TRANSFORMATION_ROTATE, TRANSFORMATION_ZOOM
|
||||||
|
|||||||
@@ -39,10 +39,6 @@ class Python(ConverterBase):
|
|||||||
|
|
||||||
new_file_object, input_filepath = tempfile.mkstemp()
|
new_file_object, input_filepath = tempfile.mkstemp()
|
||||||
|
|
||||||
if self.soffice_file_object:
|
|
||||||
os.write(new_file_object, self.soffice_file_object.read())
|
|
||||||
self.soffice_file_object.close()
|
|
||||||
else:
|
|
||||||
os.write(new_file_object, self.file_object.read())
|
os.write(new_file_object, self.file_object.read())
|
||||||
self.file_object.seek(0)
|
self.file_object.seek(0)
|
||||||
|
|
||||||
@@ -57,6 +53,8 @@ class Python(ConverterBase):
|
|||||||
fs_cleanup(input_filepath)
|
fs_cleanup(input_filepath)
|
||||||
|
|
||||||
def get_page_count(self):
|
def get_page_count(self):
|
||||||
|
super(Python, self).get_page_count()
|
||||||
|
|
||||||
page_count = 1
|
page_count = 1
|
||||||
|
|
||||||
if self.mime_type == 'application/pdf':
|
if self.mime_type == 'application/pdf':
|
||||||
@@ -64,14 +62,13 @@ class Python(ConverterBase):
|
|||||||
try:
|
try:
|
||||||
pages = slate.PDF(self.file_object)
|
pages = slate.PDF(self.file_object)
|
||||||
except Exception as exception:
|
except Exception as exception:
|
||||||
logger.error('slate exception; %s', exception)
|
logger.error('Slate exception; %s', exception)
|
||||||
return 1
|
raise
|
||||||
# TODO: Maybe return UnknownFileFormat to display proper unknwon file format message in document description
|
|
||||||
else:
|
else:
|
||||||
return len(pages)
|
return len(pages)
|
||||||
finally:
|
finally:
|
||||||
self.file_object.seek(0)
|
self.file_object.seek(0)
|
||||||
|
else:
|
||||||
try:
|
try:
|
||||||
image = Image.open(self.file_object)
|
image = Image.open(self.file_object)
|
||||||
finally:
|
finally:
|
||||||
|
|||||||
@@ -18,11 +18,15 @@ from common.settings import setting_temporary_directory
|
|||||||
from common.utils import fs_cleanup
|
from common.utils import fs_cleanup
|
||||||
from mimetype.api import get_mimetype
|
from mimetype.api import get_mimetype
|
||||||
|
|
||||||
from .exceptions import OfficeConversionError
|
from .exceptions import InvalidOfficeFormat, OfficeConversionError
|
||||||
from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT
|
from .literals import DEFAULT_PAGE_NUMBER, DEFAULT_FILE_FORMAT
|
||||||
from .settings import setting_libreoffice_path
|
from .settings import setting_libreoffice_path
|
||||||
|
|
||||||
CONVERTER_OFFICE_FILE_MIMETYPES = [
|
CHUNK_SIZE = 1024
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
CONVERTER_OFFICE_FILE_MIMETYPES = (
|
||||||
'application/msword',
|
'application/msword',
|
||||||
'application/mswrite',
|
'application/mswrite',
|
||||||
'application/mspowerpoint',
|
'application/mspowerpoint',
|
||||||
@@ -67,9 +71,7 @@ CONVERTER_OFFICE_FILE_MIMETYPES = [
|
|||||||
'text/x-shellscript',
|
'text/x-shellscript',
|
||||||
'text/plain',
|
'text/plain',
|
||||||
'text/rtf',
|
'text/rtf',
|
||||||
]
|
)
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
|
|
||||||
class ConverterBase(object):
|
class ConverterBase(object):
|
||||||
@staticmethod
|
@staticmethod
|
||||||
@@ -78,11 +80,15 @@ class ConverterBase(object):
|
|||||||
Executes libreoffice using subprocess's Popen
|
Executes libreoffice using subprocess's Popen
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
if not os.path.exists(setting_libreoffice_path.value):
|
||||||
|
raise OfficeConversionError(_('LibreOffice not installed or not found at path: %s') % setting_libreoffice_path.value)
|
||||||
|
|
||||||
new_file_object, input_filepath = tempfile.mkstemp()
|
new_file_object, input_filepath = tempfile.mkstemp()
|
||||||
new_file_object.write(file_object.read())
|
|
||||||
file_object.seek(0)
|
file_object.seek(0)
|
||||||
new_file_object.seek(0)
|
os.write(new_file_object, file_object.read())
|
||||||
new_file_object.close()
|
file_object.seek(0)
|
||||||
|
os.lseek(new_file_object, 0, os.SEEK_SET)
|
||||||
|
os.close(new_file_object)
|
||||||
|
|
||||||
command = []
|
command = []
|
||||||
command.append(setting_libreoffice_path.value)
|
command.append(setting_libreoffice_path.value)
|
||||||
@@ -100,9 +106,11 @@ class ConverterBase(object):
|
|||||||
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
|
||||||
return_code = proc.wait()
|
return_code = proc.wait()
|
||||||
logger.debug('return_code: %s', return_code)
|
logger.debug('return_code: %s', return_code)
|
||||||
|
fs_cleanup(input_filepath)
|
||||||
|
|
||||||
readline = proc.stderr.readline()
|
readline = proc.stderr.readline()
|
||||||
logger.debug('stderr: %s', readline)
|
logger.debug('stderr: %s', readline)
|
||||||
|
|
||||||
if return_code != 0:
|
if return_code != 0:
|
||||||
raise OfficeConversionError(readline)
|
raise OfficeConversionError(readline)
|
||||||
|
|
||||||
@@ -113,7 +121,14 @@ class ConverterBase(object):
|
|||||||
converted_output = os.path.join(setting_temporary_directory.value, os.path.extsep.join([filename, 'pdf']))
|
converted_output = os.path.join(setting_temporary_directory.value, os.path.extsep.join([filename, 'pdf']))
|
||||||
logger.debug('converted_output: %s', converted_output)
|
logger.debug('converted_output: %s', converted_output)
|
||||||
|
|
||||||
return converted_output
|
with open(converted_output) as converted_file_object:
|
||||||
|
while True:
|
||||||
|
data = converted_file_object.read(CHUNK_SIZE)
|
||||||
|
if not data:
|
||||||
|
break
|
||||||
|
yield data
|
||||||
|
|
||||||
|
fs_cleanup(input_filepath)
|
||||||
|
|
||||||
def __init__(self, file_object, mime_type=None):
|
def __init__(self, file_object, mime_type=None):
|
||||||
self.file_object = file_object
|
self.file_object = file_object
|
||||||
@@ -121,6 +136,12 @@ class ConverterBase(object):
|
|||||||
self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0]
|
self.mime_type = mime_type or get_mimetype(file_object=file_object, mimetype_only=False)[0]
|
||||||
self.soffice_file_object = None
|
self.soffice_file_object = None
|
||||||
|
|
||||||
|
def to_pdf(self):
|
||||||
|
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
|
||||||
|
return ConverterBase.soffice(self.file_object)
|
||||||
|
else:
|
||||||
|
raise InvalidOfficeFormat(_('Not an office file format.'))
|
||||||
|
|
||||||
def seek(self, page_number):
|
def seek(self, page_number):
|
||||||
# Starting with #0
|
# Starting with #0
|
||||||
self.file_object.seek(0)
|
self.file_object.seek(0)
|
||||||
@@ -147,22 +168,6 @@ class ConverterBase(object):
|
|||||||
def convert(self, page_number=DEFAULT_PAGE_NUMBER):
|
def convert(self, page_number=DEFAULT_PAGE_NUMBER):
|
||||||
self.page_number = page_number
|
self.page_number = page_number
|
||||||
|
|
||||||
self.mime_type = 'application/pdf'
|
|
||||||
|
|
||||||
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:
|
|
||||||
if os.path.exists(setting_libreoffice_path.value):
|
|
||||||
if not self.soffice_file_object:
|
|
||||||
converted_output = ConverterBase.soffice(self.file_object)
|
|
||||||
self.file_object.seek(0)
|
|
||||||
self.soffice_file_object = open(converted_output)
|
|
||||||
self.mime_type = 'application/pdf'
|
|
||||||
fs_cleanup(converted_output)
|
|
||||||
else:
|
|
||||||
self.soffice_file_object.seek(0)
|
|
||||||
else:
|
|
||||||
# TODO: NO LIBREOFFICE FOUND ERROR
|
|
||||||
pass
|
|
||||||
|
|
||||||
def transform(self, transformation):
|
def transform(self, transformation):
|
||||||
if not self.image:
|
if not self.image:
|
||||||
self.seek(0)
|
self.seek(0)
|
||||||
@@ -177,7 +182,7 @@ class ConverterBase(object):
|
|||||||
self.image = transformation.execute_on(self.image)
|
self.image = transformation.execute_on(self.image)
|
||||||
|
|
||||||
def get_page_count(self):
|
def get_page_count(self):
|
||||||
raise NotImplementedError()
|
raise NotImplementedError
|
||||||
|
|
||||||
|
|
||||||
class BaseTransformation(object):
|
class BaseTransformation(object):
|
||||||
|
|||||||
@@ -15,13 +15,6 @@ class UnknownFileFormat(ConvertError):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class IdentifyError(ConvertError):
|
|
||||||
"""
|
|
||||||
Raised by the graphcismagick and imagemagics identify program
|
|
||||||
"""
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
class UnkownConvertError(ConvertError):
|
class UnkownConvertError(ConvertError):
|
||||||
"""
|
"""
|
||||||
Raised when an error is found but there is no disernible way to
|
Raised when an error is found but there is no disernible way to
|
||||||
@@ -34,5 +27,5 @@ class OfficeConversionError(ConvertError):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
class OfficeBackendError(OfficeConversionError):
|
class InvalidOfficeFormat(ConvertError):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -16,9 +16,10 @@ from acls.utils import apply_default_acls
|
|||||||
from common.settings import setting_temporary_directory
|
from common.settings import setting_temporary_directory
|
||||||
from common.utils import fs_cleanup
|
from common.utils import fs_cleanup
|
||||||
from converter import (
|
from converter import (
|
||||||
converter_class, TransformationResize, TransformationRotate, TransformationZoom
|
converter_class, TransformationResize, TransformationRotate,
|
||||||
|
TransformationZoom
|
||||||
)
|
)
|
||||||
from converter.exceptions import UnknownFileFormat
|
from converter.exceptions import InvalidOfficeFormat, UnknownFileFormat
|
||||||
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION
|
from converter.literals import DEFAULT_ZOOM_LEVEL, DEFAULT_ROTATION
|
||||||
from converter.models import Transformation
|
from converter.models import Transformation
|
||||||
from mimetype.api import get_mimetype
|
from mimetype.api import get_mimetype
|
||||||
@@ -395,6 +396,45 @@ class DocumentVersion(models.Model):
|
|||||||
def page_count(self):
|
def page_count(self):
|
||||||
return self.pages.count()
|
return self.pages.count()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def uuid(self):
|
||||||
|
# Make cache UUID a mix of document UUID, version ID
|
||||||
|
return '{}-{}'.format(self.document.uuid, self.pk)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cache_filename(self):
|
||||||
|
return os.path.join(setting_cache_path.value, 'document-version-{}'.format(self.uuid))
|
||||||
|
|
||||||
|
def get_intermidiate_file(self):
|
||||||
|
cache_filename = self.cache_filename
|
||||||
|
logger.debug('Intermidiate filename: %s', cache_filename)
|
||||||
|
|
||||||
|
if os.path.exists(cache_filename):
|
||||||
|
logger.debug('Intermidiate file "%s" found.', cache_filename)
|
||||||
|
|
||||||
|
return open(cache_filename)
|
||||||
|
#converter = converter_class(file_object=open(cache_filename))
|
||||||
|
#converter.seek(0)
|
||||||
|
else:
|
||||||
|
logger.debug('Intermidiate file "%s" not found.', cache_filename)
|
||||||
|
|
||||||
|
try:
|
||||||
|
converter = converter_class(file_object=self.open())
|
||||||
|
pdf_file_object = converter.to_pdf()
|
||||||
|
|
||||||
|
with open(cache_filename, 'wb+') as file_object:
|
||||||
|
for chunk in pdf_file_object:
|
||||||
|
file_object.write(chunk)
|
||||||
|
|
||||||
|
return open(cache_filename)
|
||||||
|
except InvalidOfficeFormat:
|
||||||
|
return self.open()
|
||||||
|
except Exception as exception:
|
||||||
|
# Cleanup in case of error
|
||||||
|
logger.error('Error creating intermediate file "%s"; %s.', cache_filename, exception)
|
||||||
|
fs_cleanup(cache_filename)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
@python_2_unicode_compatible
|
@python_2_unicode_compatible
|
||||||
class DocumentTypeFilename(models.Model):
|
class DocumentTypeFilename(models.Model):
|
||||||
@@ -455,13 +495,17 @@ class DocumentPage(models.Model):
|
|||||||
def invalidate_cache(self):
|
def invalidate_cache(self):
|
||||||
fs_cleanup(self.get_cache_filename())
|
fs_cleanup(self.get_cache_filename())
|
||||||
|
|
||||||
def get_uuid(self):
|
@property
|
||||||
# Make cache UUID a mix of document UUID, version ID and page ID to
|
def uuid(self):
|
||||||
# avoid using stale images
|
"""
|
||||||
return 'page-cache-{}-{}-{}'.format(self.document.uuid, self.document_version.pk, self.pk)
|
Make cache UUID a mix of version ID and page ID to avoid using stale
|
||||||
|
images
|
||||||
|
"""
|
||||||
|
return '{}-{}'.format(self.document_version.uuid, self.pk)
|
||||||
|
|
||||||
def get_cache_filename(self):
|
@property
|
||||||
return os.path.join(setting_cache_path.value, self.get_uuid())
|
def cache_filename(self):
|
||||||
|
return os.path.join(setting_cache_path.value, 'page-cache-{}'.format(self.uuid))
|
||||||
|
|
||||||
def get_image(self, *args, **kwargs):
|
def get_image(self, *args, **kwargs):
|
||||||
as_base64 = kwargs.pop('as_base64', False)
|
as_base64 = kwargs.pop('as_base64', False)
|
||||||
@@ -478,22 +522,27 @@ class DocumentPage(models.Model):
|
|||||||
|
|
||||||
rotation = rotation % 360
|
rotation = rotation % 360
|
||||||
|
|
||||||
cache_filename = self.get_cache_filename()
|
cache_filename = self.cache_filename
|
||||||
|
logger.debug('Page cache filename: %s', cache_filename)
|
||||||
|
|
||||||
if os.path.exists(cache_filename):
|
if os.path.exists(cache_filename):
|
||||||
|
logger.debug('Page cache file "%s" found', cache_filename)
|
||||||
converter = converter_class(file_object=open(cache_filename))
|
converter = converter_class(file_object=open(cache_filename))
|
||||||
|
|
||||||
converter.seek(0)
|
converter.seek(0)
|
||||||
else:
|
else:
|
||||||
|
logger.debug('Page cache file "%s" not found', cache_filename)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
converter = converter_class(file_object=self.document_version.open())
|
converter = converter_class(file_object=self.document_version.get_intermidiate_file())
|
||||||
converter.seek(page_number=self.page_number - 1)
|
converter.seek(page_number=self.page_number - 1)
|
||||||
|
|
||||||
page_image = converter.get_page()
|
page_image = converter.get_page()
|
||||||
with open(cache_filename, 'wb+') as file_object:
|
with open(cache_filename, 'wb+') as file_object:
|
||||||
file_object.write(page_image.getvalue())
|
file_object.write(page_image.getvalue())
|
||||||
except:
|
except Exception as exception:
|
||||||
# Cleanup in case of error
|
# Cleanup in case of error
|
||||||
|
logger.error('Error creating page cache file "%s".', cache_filename)
|
||||||
fs_cleanup(cache_filename)
|
fs_cleanup(cache_filename)
|
||||||
raise
|
raise
|
||||||
|
|
||||||
|
|||||||
@@ -263,6 +263,9 @@ class SourceColumn(object):
|
|||||||
return cls._registry[source.__class__]
|
return cls._registry[source.__class__]
|
||||||
except KeyError:
|
except KeyError:
|
||||||
return ()
|
return ()
|
||||||
|
except TypeError:
|
||||||
|
# unhashable type: list
|
||||||
|
return ()
|
||||||
|
|
||||||
def __init__(self, source, label, attribute):
|
def __init__(self, source, label, attribute):
|
||||||
self.__class__._registry.setdefault(source, [])
|
self.__class__._registry.setdefault(source, [])
|
||||||
|
|||||||
Reference in New Issue
Block a user