Files
mayan-edms/mayan/apps/common/compressed_files.py
Roberto Rosario 7889ff64d9 Document zip file encoding issues
Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
2019-10-01 00:20:50 -04:00

199 lines
5.6 KiB
Python

from __future__ import unicode_literals
from io import BytesIO
import tarfile
import zipfile
try:
import zlib # NOQA
COMPRESSION = zipfile.ZIP_DEFLATED
except ImportError:
COMPRESSION = zipfile.ZIP_STORED
from django.core.files.uploadedfile import SimpleUploadedFile
from django.utils.encoding import force_text
from mayan.apps.mimetype.api import get_mimetype
from .exceptions import NoMIMETypeMatch
class Archive(object):
    """
    Base class and registry for archive format handlers.

    Subclasses keep the underlying archive object in ``self._archive`` and
    are associated with one or more MIME types through ``register``.
    """
    # Maps a MIME type string to the list of archive classes registered
    # for it.
    _registry = {}

    @classmethod
    def register(cls, mime_types, archive_classes):
        """
        Associate every class in archive_classes with every MIME type in
        mime_types.
        """
        for mime_type in mime_types:
            for archive_class in archive_classes:
                cls._registry.setdefault(
                    mime_type, []
                ).append(archive_class)

    @classmethod
    def open(cls, file_object):
        """
        Return an archive instance opened on file_object.

        The MIME type reported by get_mimetype selects the archive class;
        the first class registered for that MIME type is instantiated and
        opened. Raises NoMIMETypeMatch when no class is registered for the
        detected MIME type.
        """
        mime_type = get_mimetype(
            file_object=file_object, mimetype_only=True
        )[0]

        # Guard only the registry lookup. A KeyError raised while opening
        # the archive itself is a different failure and must not be
        # reported as an unknown MIME type.
        try:
            archive_classes = cls._registry[mime_type]
        except KeyError:
            raise NoMIMETypeMatch

        for archive_class in archive_classes:
            instance = archive_class()
            instance._open(file_object=file_object)
            return instance

    def _open(self, file_object):
        """
        Open an existing archive from file_object. Implemented by
        subclasses.
        """
        raise NotImplementedError

    def add_file(self, file_object, filename):
        """
        Add a file as a member of an archive
        """
        raise NotImplementedError

    def close(self):
        """
        Close the underlying archive object stored in self._archive.
        """
        self._archive.close()

    def create(self):
        """
        Create an empty archive
        """
        raise NotImplementedError

    def get_members(self):
        """
        Return a generator of SimpleUploadedFile instances, one for each
        name returned by members(), holding that member's content.
        """
        return (
            SimpleUploadedFile(
                content=self.member_contents(filename=filename), name=filename
            ) for filename in self.members()
        )

    def member_contents(self, filename):
        """
        Return the content of a member
        """
        raise NotImplementedError

    def members(self):
        """
        Return a list of all the elements inside the archive
        """
        raise NotImplementedError

    def open_member(self, filename):
        """
        Return a file-like object to a member of the archive
        """
        raise NotImplementedError
class TarArchive(Archive):
    """
    Archive handler backed by the standard library tarfile module.
    """
    def _open(self, file_object):
        """
        Open an existing tar archive from file_object.
        """
        self._archive = tarfile.open(fileobj=file_object)

    def add_file(self, file_object, filename):
        """
        Add the content of file_object to the archive under the name
        filename.
        """
        # The tar header must carry the member name and its exact size.
        # A blank TarInfo stores a nameless, zero length entry and the
        # file content is never written.
        content = file_object.read()
        tar_info = tarfile.TarInfo(name=filename)
        tar_info.size = len(content)
        self._archive.addfile(tarinfo=tar_info, fileobj=BytesIO(content))

    def create(self):
        """
        Create an empty tar archive backed by an in-memory buffer kept in
        self.string_buffer.
        """
        self.string_buffer = BytesIO()
        self._archive = tarfile.TarFile(fileobj=self.string_buffer, mode='w')

    def member_contents(self, filename):
        """
        Return the content of the member named filename.
        """
        return self._archive.extractfile(filename).read()

    def members(self):
        """
        Return the names of the archive members, skipping directories.
        """
        # Directory entries have no content: extractfile() returns None
        # for them, which would break member_contents() and get_members().
        return [
            tar_info.name for tar_info in self._archive.getmembers()
            if not tar_info.isdir()
        ]

    def open_member(self, filename):
        """
        Return a file-like object for the member named filename.
        """
        return self._archive.extractfile(filename)
class ZipArchive(Archive):
    """
    Archive handler backed by the standard library zipfile module.
    """
    def _open(self, file_object):
        """
        Open an existing Zip archive from file_object.
        """
        self._archive = zipfile.ZipFile(file_object)

    def add_file(self, file_object, filename):
        """
        Add the content of file_object to the archive under the name
        filename, using the module level COMPRESSION method.
        """
        # The first two arguments are passed positionally because their
        # keyword names differ between Python versions.
        # Python 2 syntax:
        # ZipFile.writestr(zinfo_or_arcname, bytes[, compress_type])
        # Python 3 syntax:
        # ZipFile.writestr(
        #     zinfo_or_arcname, data, compress_type=None, compresslevel=None
        # )
        # TODO: Change this to keyword arguments when the move to Python 3
        # and Django 2.x is complete.
        self._archive.writestr(
            filename, file_object.read(), compress_type=COMPRESSION
        )

    def create(self):
        """
        Create an empty Zip archive backed by an in-memory buffer kept in
        self.string_buffer.
        """
        self.string_buffer = BytesIO()
        self._archive = zipfile.ZipFile(self.string_buffer, mode='w')

    def member_contents(self, filename):
        """
        Return the content of the member named filename.
        """
        return self._archive.read(filename)

    def members(self):
        """
        Return the names of the archive members, skipping directories.

        Names are returned exactly as reported by the archive, so they can
        be passed back to member_contents() and open_member().
        """
        results = []

        for filename in self._archive.namelist():
            # Zip files only support UTF-8 and CP437 encodings. Names may
            # be reported as text or, on Python 2, as CP437 encoded bytes.
            # CP437 is a single byte encoding in which the slash keeps its
            # ASCII value, so the directory marker can be tested on the
            # raw bytes without a decode and re-encode round trip.
            # Future improvement that violates the Zip format:
            # Add chardet.detect to detect the most likely encoding
            # if other than CP437.
            if isinstance(filename, bytes):
                is_directory = filename.endswith(b'/')
            else:
                is_directory = filename.endswith('/')

            if not is_directory:
                results.append(filename)

        return results

    def open_member(self, filename):
        """
        Return a file-like object for the member named filename.
        """
        return self._archive.open(filename)

    def write(self, filename=None):
        """
        Write the in-memory archive buffer to the path filename, or return
        the buffer, rewound to its start, when no filename is given.

        Requires create() to have been called, as it sets up the buffer.
        NOTE: zipfile writes the central directory when the archive is
        closed, so close() should be called before write() to obtain a
        complete archive.
        """
        # fix for Linux zip files read in Windows
        for entry in self._archive.filelist:
            entry.create_system = 0

        self.string_buffer.seek(0)

        if filename:
            # The buffer holds bytes, the destination must be opened in
            # binary mode.
            with open(filename, 'wb') as file_object:
                file_object.write(self.string_buffer.read())
        else:
            return self.string_buffer

    def as_file(self, filename):
        """
        Return the archive buffer content as a SimpleUploadedFile named
        filename.
        """
        return SimpleUploadedFile(name=filename, content=self.write().read())
# Map each supported MIME type to the archive class that handles it.
Archive.register(
    mime_types=('application/zip',), archive_classes=(ZipArchive,)
)
# The tar, gzip and bzip2 MIME types are all handled by TarArchive.
Archive.register(
    mime_types=(
        'application/x-tar', 'application/gzip', 'application/x-bzip2'
    ), archive_classes=(TarArchive,)
)