Remove converter.to_pdf iterator

Remove the custom iterator that was used to return the result of a
conversion to PDF. Instead, return a file object, which can then be
copied around using shutil.copyfileobj.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2019-05-14 01:58:49 -04:00
parent 8b073c3151
commit 4e5c513529
3 changed files with 84 additions and 76 deletions

View File

@@ -70,16 +70,6 @@ except sh.CommandNotFound:
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
class IteratorIO(object):
def __init__(self, iterator):
self.file_buffer = io.BytesIO()
for chunk in iterator:
self.file_buffer.write(chunk)
self.file_buffer.seek(0)
class Python(ConverterBase): class Python(ConverterBase):
def convert(self, *args, **kwargs): def convert(self, *args, **kwargs):
super(Python, self).convert(*args, **kwargs) super(Python, self).convert(*args, **kwargs)
@@ -142,7 +132,7 @@ class Python(ConverterBase):
if self.mime_type == 'application/pdf' or self.soffice_file: if self.mime_type == 'application/pdf' or self.soffice_file:
if self.soffice_file: if self.soffice_file:
file_object = IteratorIO(self.soffice_file).file_buffer file_object = self.soffice_file
else: else:
file_object = self.file_object file_object = self.file_object

View File

@@ -168,66 +168,81 @@ class ConverterBase(object):
_('LibreOffice not installed or not found.') _('LibreOffice not installed or not found.')
) )
new_file_object = NamedTemporaryFile() with NamedTemporaryFile() as temporary_file_object:
input_filepath = new_file_object.name # Copy the source file object of the converter instance to a
self.file_object.seek(0) # named temporary file to be able to pass it to the LibreOffice
shutil.copyfileobj(fsrc=self.file_object, fdst=new_file_object) # execution.
self.file_object.seek(0) self.file_object.seek(0)
new_file_object.seek(0) shutil.copyfileobj(
fsrc=self.file_object, fdst=temporary_file_object
libreoffice_filter = None
if self.mime_type == 'text/plain':
libreoffice_filter = 'Text (encoded):UTF8,LF,,,'
libreoffice_home_directory = mkdtemp()
args = (
input_filepath, '--outdir', setting_temporary_directory.value,
'-env:UserInstallation=file://{}'.format(
os.path.join(
libreoffice_home_directory, 'LibreOffice_Conversion'
)
),
)
kwargs = {'_env': {'HOME': libreoffice_home_directory}}
if libreoffice_filter:
kwargs.update({'infilter': libreoffice_filter})
try:
LIBREOFFICE(*args, **kwargs)
except sh.ErrorReturnCode as exception:
new_file_object.close()
raise OfficeConversionError(exception)
except Exception as exception:
new_file_object.close()
logger.error('Exception launching Libre Office; %s', exception)
raise
finally:
fs_cleanup(libreoffice_home_directory)
filename, extension = os.path.splitext(
os.path.basename(input_filepath)
)
logger.debug('filename: %s', filename)
logger.debug('extension: %s', extension)
converted_output = os.path.join(
setting_temporary_directory.value, os.path.extsep.join(
(filename, 'pdf')
) )
) self.file_object.seek(0)
logger.debug('converted_output: %s', converted_output) temporary_file_object.seek(0)
with open(converted_output, mode='rb') as converted_file_object: libreoffice_home_directory = mkdtemp()
while True: args = (
data = converted_file_object.read(CHUNK_SIZE) temporary_file_object.name, '--outdir', setting_temporary_directory.value,
if not data: '-env:UserInstallation=file://{}'.format(
break os.path.join(
yield data libreoffice_home_directory, 'LibreOffice_Conversion'
)
),
)
new_file_object.close() kwargs = {'_env': {'HOME': libreoffice_home_directory}}
fs_cleanup(converted_output)
if self.mime_type == 'text/plain':
kwargs.update(
{'infilter': 'Text (encoded):UTF8,LF,,,'}
)
try:
LIBREOFFICE(*args, **kwargs)
except sh.ErrorReturnCode as exception:
temporary_file_object.close()
raise OfficeConversionError(exception)
except Exception as exception:
temporary_file_object.close()
logger.error('Exception launching Libre Office; %s', exception)
raise
finally:
fs_cleanup(libreoffice_home_directory)
# LibreOffice returns a PDF file with the same name as the input
# provided but with the .pdf extension.
# Get the converted output file path out of the temporary file
# name plus the temporary directory
filename, extension = os.path.splitext(
os.path.basename(temporary_file_object.name)
)
logger.debug('filename: %s', filename)
logger.debug('extension: %s', extension)
converted_file_path = os.path.join(
setting_temporary_directory.value, os.path.extsep.join(
(filename, 'pdf')
)
)
logger.debug('converted_file_path: %s', converted_file_path)
# Don't use context manager with the NamedTemporaryFile on purpose
# so that it is deleted when the caller closes the file and not
# before.
temporary_converted_file_object = NamedTemporaryFile()
# Copy the LibreOffice output file to a new named temporary file
# and delete the converted file
with open(converted_file_path, mode='rb') as converted_file_object:
shutil.copyfileobj(
fsrc=converted_file_object, fdst=temporary_converted_file_object
)
fs_cleanup(converted_file_path)
temporary_converted_file_object.seek(0)
return temporary_converted_file_object
def to_pdf(self): def to_pdf(self):
if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES: if self.mime_type in CONVERTER_OFFICE_FILE_MIMETYPES:

View File

@@ -176,16 +176,19 @@ class DocumentVersion(models.Model):
try: try:
converter = get_converter_class()(file_object=self.open()) converter = get_converter_class()(file_object=self.open())
pdf_file_object = converter.to_pdf() with converter.to_pdf() as pdf_file_object:
# Since open "wb+" doesn't create files, check if the file # Since open "wb+" doesn't create files, check if the file
# exists, if not then create it # exists, if not then create it
if not storage_documentimagecache.exists(cache_filename): if not storage_documentimagecache.exists(cache_filename):
storage_documentimagecache.save(name=cache_filename, content=ContentFile(content='')) storage_documentimagecache.save(
name=cache_filename, content=ContentFile(content='')
)
with storage_documentimagecache.open(cache_filename, mode='wb+') as file_object: with storage_documentimagecache.open(cache_filename, mode='wb+') as file_object:
for chunk in pdf_file_object: shutil.copyfileobj(
file_object.write(chunk) fsrc=pdf_file_object, fdst=file_object
)
return storage_documentimagecache.open(cache_filename) return storage_documentimagecache.open(cache_filename)
except InvalidOfficeFormat: except InvalidOfficeFormat: