diff --git a/HISTORY.rst b/HISTORY.rst index 9cc9ca28e0..2dae27dc86 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -32,6 +32,9 @@ information. * Validate the state completion value before saving. Thanks to Manoel Brunnen (@mbru) for the report and debug information. GitLab issue #557. +* Add the MIMETYPE_FILE_READ_SIZE setting to limit the number of bytes read + to determine the MIME type of a new document. + 3.1.9 (2018-11-01) ================== diff --git a/docs/releases/3.1.10.rst b/docs/releases/3.1.10.rst index 1b23b40c4d..49aadf5717 100644 --- a/docs/releases/3.1.10.rst +++ b/docs/releases/3.1.10.rst @@ -80,6 +80,21 @@ don't contain at least one version, and don't contain a single page. All these are requirements for a valid document in Mayan EDMS. +Memory usage +^^^^^^^^^^^^ + +The MIMETYPE_FILE_READ_SIZE setting was added to limit the number of bytes that +will be read into memory to determine the MIME type of a new document. For +compatibility with the current behavior this setting defaults to 0 which means +that it is disabled. Disabling the setting will cause the entire document's +file to be loaded into memory. If documents are not processing due to out of +memory errors (large documents or devices with limited memory), set +MIMETYPE_FILE_READ_SIZE to a value other than 0. Limited tests suggest 1024 +to be a good alternative as most "magic numbers" used for MIME type detection +are located at the start of the file and just reading the first 1024 bytes will +result in a positive identification with little memory usage. 
+ + Other changes ^^^^^^^^^^^^^ diff --git a/mayan/apps/mimetype/api.py b/mayan/apps/mimetype/api.py index f0d0001d91..f6e8555b9e 100644 --- a/mayan/apps/mimetype/api.py +++ b/mayan/apps/mimetype/api.py @@ -2,24 +2,33 @@ from __future__ import unicode_literals import magic +from .settings import setting_file_read_size + def get_mimetype(file_object, mimetype_only=False): """ Determine a file's mimetype by calling the system's libmagic - library via python-magic or fallback to use python's mimetypes - library + library via python-magic. """ file_mimetype = None file_mime_encoding = None + read_size = setting_file_read_size.value + if read_size == 0: + # If the setting value is 0 that means disable read limit. To disable + # the read limit passing None won't work, we pass -1 instead as per + # the Python documentation. + # https://docs.python.org/2/tutorial/inputoutput.html#methods-of-file-objects + read_size = -1 + mime = magic.Magic(mime=True) - file_mimetype = mime.from_buffer(file_object.read()) + file_mimetype = mime.from_buffer(file_object.read(read_size)) file_object.seek(0) if not mimetype_only: file_object.seek(0) mime_encoding = magic.Magic(mime_encoding=True) - file_mime_encoding = mime_encoding.from_buffer(file_object.read()) + file_mime_encoding = mime_encoding.from_buffer(file_object.read(read_size)) file_object.seek(0) return file_mimetype, file_mime_encoding diff --git a/mayan/apps/mimetype/apps.py b/mayan/apps/mimetype/apps.py index 9aedf1a23d..ebbbf4b39d 100644 --- a/mayan/apps/mimetype/apps.py +++ b/mayan/apps/mimetype/apps.py @@ -9,6 +9,7 @@ from .licenses import * # NOQA class MIMETypesApp(MayanAppConfig): name = 'mimetype' + has_tests = True verbose_name = _('MIME types') def ready(self, *args, **kwargs): diff --git a/mayan/apps/mimetype/settings.py b/mayan/apps/mimetype/settings.py new file mode 100644 index 0000000000..db312dc71d --- /dev/null +++ b/mayan/apps/mimetype/settings.py @@ -0,0 +1,15 @@ +from __future__ import unicode_literals + 
+from django.utils.translation import ugettext_lazy as _ + +from smart_settings import Namespace + +namespace = Namespace(label=_('MIME type'), name='mimetype') + +setting_file_read_size = namespace.add_setting( + default=0, global_name='MIMETYPE_FILE_READ_SIZE', help_text=_( + 'Amount of bytes to read from a document to determine its MIME type. ' + 'Setting it to 0 disables the feature and attempts to read the entire ' + 'document file into memory.' + ) +) diff --git a/mayan/apps/mimetype/tests/__init__.py b/mayan/apps/mimetype/tests/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/mayan/apps/mimetype/tests/test_functions.py b/mayan/apps/mimetype/tests/test_functions.py new file mode 100644 index 0000000000..2c3fea150b --- /dev/null +++ b/mayan/apps/mimetype/tests/test_functions.py @@ -0,0 +1,38 @@ +from __future__ import unicode_literals + +import resource + +from django.test import override_settings + +from common.tests import BaseTestCase +from documents.models import Document +from documents.tests import DocumentTestMixin, TEST_DOCUMENT_FILENAME + +# This constant may need tweaking as document upload code path changes. +# The value is targeted at making the document upload process fail exactly +# during the MIME type detection phase. Different architectures may need +# different values. 
+MAXIMUM_HEAP_MEMORY = 140000000 + + +@override_settings(OCR_AUTO_OCR=False) +@override_settings(DOCUMENT_PARSING_AUTO_PARSING=False) +class MIMETypeTestCase(DocumentTestMixin, BaseTestCase): + auto_upload_document = False + test_document_filename = TEST_DOCUMENT_FILENAME + + def setUp(self): + super(MIMETypeTestCase, self).setUp() + resource.setrlimit(resource.RLIMIT_DATA, (MAXIMUM_HEAP_MEMORY, -1)) + + def test_little_memory_full_file(self): + with self.assertRaises(Exception): + self.upload_document() + + self.assertEqual(Document.objects.count(), 0) + + @override_settings(MIMETYPE_FILE_READ_SIZE=1024) + def test_little_memory_partial_file(self): + self.upload_document() + + self.assertEqual(Document.objects.count(), 1)