Add the MIMETYPE_FILE_READ_SIZE setting

This new setting is used to limit the number of bytes read
while determining the MIME type of a new document. A value
of 0 will cause the entire file to be loaded into memory.
1024 appears to be a suitable number for most cases. This
setting defaults to 0 to preserve the current behavior, but the
default might change in a future version.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2019-04-03 22:24:19 -04:00
parent 5f877cdc22
commit a56e3ca111
7 changed files with 85 additions and 4 deletions

View File

@@ -32,6 +32,9 @@
information.
* Validate the state completion value before saving. Thanks to Manoel Brunnen
(@mbru) for the report and debug information. GitLab issue #557.
* Add the MIMETYPE_FILE_READ_SIZE setting to limit the number of bytes read
to determine the MIME type of a new document.
3.1.9 (2018-11-01)
==================

View File

@@ -80,6 +80,21 @@ don't contain at least one version, and don't contain a single page. All these
are requirements for a valid document in Mayan EDMS.
Memory usage
^^^^^^^^^^^^
The MIMETYPE_FILE_READ_SIZE setting was added to limit the number of bytes that
will be read into memory to determine the MIME type of a new document. For
compatibility with the current behavior, this setting defaults to 0 which means
that it is disabled. Disabling the setting will cause the entire document's
file to be loaded into memory. If documents are not processing due to out of
memory errors (large documents or devices with limited memory), set
MIMETYPE_FILE_READ_SIZE to a value other than 0. Limited tests suggest 1024
to be a good alternative as most "magic numbers" used for MIME type detection
are located at the start of the file and just reading the first 1024 bytes will
result in a positive identification with little memory usage.
Other changes
^^^^^^^^^^^^^

View File

@@ -2,24 +2,33 @@ from __future__ import unicode_literals
import magic
from .settings import setting_file_read_size
def get_mimetype(file_object, mimetype_only=False):
    """
    Determine a file's MIME type and, optionally, its MIME encoding by
    calling the system's libmagic library via python-magic.

    The number of bytes inspected is controlled by the
    MIMETYPE_FILE_READ_SIZE setting. A value of 0 disables the limit and
    the entire file is read into memory.

    file_object: a seekable, readable file-like object. It is rewound to
    the beginning before reading and again before returning.
    mimetype_only: when True the MIME encoding detection is skipped.

    Returns a tuple of (file_mimetype, file_mime_encoding).
    file_mime_encoding is None when mimetype_only is True.
    """
    read_size = setting_file_read_size.value
    if not read_size or read_size < 0:
        # A setting value of 0 means the read limit is disabled. Passing
        # None to read() won't work on every file object, so -1 is used
        # instead as per the Python documentation.
        # https://docs.python.org/2/tutorial/inputoutput.html#methods-of-file-objects
        read_size = -1

    # Always sample from the start of the file: that is where the "magic
    # numbers" live. The sample is read only once and shared by both
    # detections so the file is never loaded into memory twice.
    file_object.seek(0)
    sample = file_object.read(read_size)
    file_object.seek(0)

    file_mimetype = magic.Magic(mime=True).from_buffer(sample)

    file_mime_encoding = None
    if not mimetype_only:
        file_mime_encoding = magic.Magic(
            mime_encoding=True
        ).from_buffer(sample)

    return file_mimetype, file_mime_encoding

View File

@@ -9,6 +9,7 @@ from .licenses import * # NOQA
class MIMETypesApp(MayanAppConfig):
name = 'mimetype'
has_tests = True
verbose_name = _('MIME types')
def ready(self, *args, **kwargs):

View File

@@ -0,0 +1,15 @@
from __future__ import unicode_literals
from django.utils.translation import ugettext_lazy as _
from smart_settings import Namespace
# Settings namespace for the MIME type app.
namespace = Namespace(
    name='mimetype',
    label=_('MIME type'),
)

# Byte limit used when sampling a document for MIME type detection.
# The default of 0 means no limit (the whole file is read).
setting_file_read_size = namespace.add_setting(
    global_name='MIMETYPE_FILE_READ_SIZE',
    default=0,
    help_text=_(
        'Amount of bytes to read from a document to determine its MIME type. '
        'Setting it to 0 disables the feature and attempts to read the entire '
        'document file into memory.'
    ),
)

View File

View File

@@ -0,0 +1,38 @@
from __future__ import unicode_literals
import resource
from django.test import override_settings
from common.tests import BaseTestCase
from documents.models import Document
from documents.tests import DocumentTestMixin, TEST_DOCUMENT_FILENAME
# Size, in bytes, applied as the process data segment limit (RLIMIT_DATA)
# by the test case's setUp method.
# This constant may need tweaking as document upload code path changes.
# The value is targeted at making the document upload process fail exactly
# during the MIME type detection phase. Different architectures may need
# different values.
MAXIMUM_HEAP_MEMORY = 140000000
@override_settings(OCR_AUTO_OCR=False)
@override_settings(DOCUMENT_PARSING_AUTO_PARSING=False)
class MIMETypeTestCase(DocumentTestMixin, BaseTestCase):
    """
    Upload a document while the process data segment is capped at
    MAXIMUM_HEAP_MEMORY bytes, to check the effect of the
    MIMETYPE_FILE_READ_SIZE setting on memory usage.
    """
    auto_upload_document = False
    test_document_filename = TEST_DOCUMENT_FILENAME

    def setUp(self):
        super(MIMETypeTestCase, self).setUp()
        # Remember the limits in effect so tearDown can restore them. The
        # limit is process-wide and would otherwise leak into every test
        # that runs afterwards.
        self._original_data_limit = resource.getrlimit(resource.RLIMIT_DATA)
        # Only lower the soft limit. The hard limit is left untouched
        # because an unprivileged process is not allowed to raise it.
        hard_limit = self._original_data_limit[1]
        resource.setrlimit(
            resource.RLIMIT_DATA, (MAXIMUM_HEAP_MEMORY, hard_limit)
        )

    def tearDown(self):
        # Lift the memory cap first so the parent cleanup is not starved.
        resource.setrlimit(resource.RLIMIT_DATA, self._original_data_limit)
        super(MIMETypeTestCase, self).tearDown()

    def test_little_memory_full_file(self):
        # With the default setting the upload is expected to fail under
        # the memory cap and leave no document behind.
        with self.assertRaises(Exception):
            self.upload_document()

        self.assertEqual(Document.objects.count(), 0)

    @override_settings(MIMETYPE_FILE_READ_SIZE=1024)
    def test_little_memory_partial_file(self):
        # Reading only the first 1024 bytes is expected to let the upload
        # succeed under the same memory cap.
        self.upload_document()

        self.assertEqual(Document.objects.count(), 1)