Add the MIMETYPE_FILE_READ_SIZE setting
This new setting is used to limit the number of bytes read while determining the MIME type of a new document. A value of 0 will cause the entire file to be loaded into memory. 1024 appears to be a suitable number for most cases. This setting defaults to 0 to preserve the current behavior but might change in a future version. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
@@ -32,6 +32,9 @@
|
||||
information.
|
||||
* Validate the state completion value before saving. Thanks to Manoel Brunnen
|
||||
(@mbru) for the report and debug information. GitLab issue #557.
|
||||
* Add the MIMETYPE_FILE_READ_SIZE setting to limit the number of bytes read
|
||||
to determine the MIME type of a new document.
|
||||
|
||||
|
||||
3.1.9 (2018-11-01)
|
||||
==================
|
||||
|
||||
@@ -80,6 +80,21 @@ don't contain at least one version, and don't contain a single page. All these
|
||||
are requirements for a valid document in Mayan EDMS.
|
||||
|
||||
|
||||
Memory usage
|
||||
^^^^^^^^^^^^
|
||||
|
||||
The MIMETYPE_FILE_READ_SIZE setting was added to limit the number of bytes that
|
||||
will be read into memory to determine the MIME type of a new document. For
|
||||
compatibility with the current behavior this setting defaults to 0 which means
|
||||
that it is disabled. Disabling the setting will cause the entire document's
|
||||
file to be loaded into memory. If documents are not processing due to out of
|
||||
memory errors (large documents or devices with limited memory), set
|
||||
MIMETYPE_FILE_READ_SIZE to a value other than 0. Limited tests suggest 1024
|
||||
to be a good alternative as most "magic numbers" used for MIME type detection
|
||||
are located at the start of the file and just reading the first 1024 bytes will
|
||||
result in a positive identification with little memory usage.
|
||||
|
||||
|
||||
Other changes
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
|
||||
@@ -2,24 +2,33 @@ from __future__ import unicode_literals
|
||||
|
||||
import magic
|
||||
|
||||
from .settings import setting_file_read_size
|
||||
|
||||
|
||||
def get_mimetype(file_object, mimetype_only=False):
    """
    Determine a file's MIME type and encoding by calling the system's
    libmagic library via python-magic.

    At most MIMETYPE_FILE_READ_SIZE bytes are read from the start of
    `file_object`. A setting value of 0 disables the limit and causes the
    entire file to be read into memory. The file position is reset to 0
    before returning.

    Returns a (mimetype, encoding) tuple. The encoding is None when
    `mimetype_only` is True.
    """
    read_size = setting_file_read_size.value
    if not read_size:
        # If the setting value is 0 that means disable read limit. To disable
        # the read limit passing None won't work, we pass -1 instead as per
        # the Python documentation.
        # https://docs.python.org/2/tutorial/inputoutput.html#methods-of-file-objects
        read_size = -1

    # Sample the file once, always from its start, and reuse the same bytes
    # for both detections. This avoids a second read of up to the whole file
    # and guarantees the MIME type and the encoding are computed from the
    # same data.
    file_object.seek(0)
    sample = file_object.read(read_size)
    file_object.seek(0)

    file_mimetype = magic.Magic(mime=True).from_buffer(sample)

    file_mime_encoding = None
    if not mimetype_only:
        file_mime_encoding = magic.Magic(mime_encoding=True).from_buffer(
            sample
        )

    return file_mimetype, file_mime_encoding
|
||||
|
||||
@@ -9,6 +9,7 @@ from .licenses import * # NOQA
|
||||
|
||||
class MIMETypesApp(MayanAppConfig):
|
||||
name = 'mimetype'
|
||||
has_tests = True
|
||||
verbose_name = _('MIME types')
|
||||
|
||||
def ready(self, *args, **kwargs):
|
||||
|
||||
15
mayan/apps/mimetype/settings.py
Normal file
15
mayan/apps/mimetype/settings.py
Normal file
@@ -0,0 +1,15 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
from django.utils.translation import ugettext_lazy as _
|
||||
|
||||
from smart_settings import Namespace
|
||||
|
||||
# Settings namespace of the mimetype app.
namespace = Namespace(name='mimetype', label=_('MIME type'))

# Number of bytes read from a document to determine its MIME type. The
# default of 0 leaves the read unlimited.
setting_file_read_size = namespace.add_setting(
    global_name='MIMETYPE_FILE_READ_SIZE',
    default=0,
    help_text=_(
        'Amount of bytes to read from a document to determine its MIME type. '
        'Setting it to 0 disables the feature and attempts to read the entire '
        'document file into memory.'
    )
)
|
||||
0
mayan/apps/mimetype/tests/__init__.py
Normal file
0
mayan/apps/mimetype/tests/__init__.py
Normal file
38
mayan/apps/mimetype/tests/test_functions.py
Normal file
38
mayan/apps/mimetype/tests/test_functions.py
Normal file
@@ -0,0 +1,38 @@
|
||||
from __future__ import unicode_literals
|
||||
|
||||
import resource
|
||||
|
||||
from django.test import override_settings
|
||||
|
||||
from common.tests import BaseTestCase
|
||||
from documents.models import Document
|
||||
from documents.tests import DocumentTestMixin, TEST_DOCUMENT_FILENAME
|
||||
|
||||
# This constant may need tweaking as document upload code path changes.
|
||||
# The value is targeted at making the document upload process fail exactly
|
||||
# during the MIME type detection phase. Different architectures may need
|
||||
# different values.
|
||||
MAXIMUM_HEAP_MEMORY = 140000000
|
||||
|
||||
|
||||
@override_settings(DOCUMENT_PARSING_AUTO_PARSING=False, OCR_AUTO_OCR=False)
class MIMETypeTestCase(DocumentTestMixin, BaseTestCase):
    """
    Exercise document upload while the process data segment is capped at
    MAXIMUM_HEAP_MEMORY bytes, with and without a MIME type read limit.
    """
    auto_upload_document = False
    test_document_filename = TEST_DOCUMENT_FILENAME

    def setUp(self):
        super(MIMETypeTestCase, self).setUp()

        # The resource limit is process wide. Remember the original values
        # and restore them after each test so the reduced limit does not
        # leak into the rest of the test suite.
        original_limits = resource.getrlimit(resource.RLIMIT_DATA)
        self.addCleanup(
            resource.setrlimit, resource.RLIMIT_DATA, original_limits
        )

        # Only lower the soft limit and keep the existing hard limit.
        # Unprivileged processes are not allowed to raise the hard limit and
        # the soft limit can later be restored up to the hard limit.
        resource.setrlimit(
            resource.RLIMIT_DATA, (MAXIMUM_HEAP_MEMORY, original_limits[1])
        )

    def test_little_memory_full_file(self):
        # With the default setting the upload is expected to fail under the
        # memory limit and no document should be created.
        with self.assertRaises(Exception):
            self.upload_document()

        self.assertEqual(Document.objects.count(), 0)

    @override_settings(MIMETYPE_FILE_READ_SIZE=1024)
    def test_little_memory_partial_file(self):
        # Limiting the read size to 1024 bytes is expected to let the upload
        # complete under the same memory limit.
        self.upload_document()

        self.assertEqual(Document.objects.count(), 1)
|
||||
Reference in New Issue
Block a user