Add the MIMETYPE_FILE_READ_SIZE setting

This new setting is used to limit the number of bytes read while determining the MIME type of a new document. A value of 0 will cause the entire file to be loaded into memory. 1024 appears to be a suitable number for most cases. This setting defaults to 0 to preserve the current behavior, but the default might change in a future version.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-04-03 22:24:19 -04:00
parent 5f877cdc22
commit a56e3ca111
7 changed files with 85 additions and 4 deletions
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -32,6 +32,9 @@
  information.
 * Validate the state completion value before saving. Thanks to Manoel Brunnen
  (@mbru) for the report and debug information. GitLab issue #557.
+* Add the MIMETYPE_FILE_READ_SIZE setting to limit the number of bytes read
+  to determine the MIME type of a new document.
+

 3.1.9 (2018-11-01)
 ==================
--- a/docs/releases/3.1.10.rst
+++ b/docs/releases/3.1.10.rst
@@ -80,6 +80,21 @@ don't contain at least one version, and don't contain a single page. All these
 are requirements for a valid document in Mayan EDMS.


+Memory usage
+^^^^^^^^^^^^
+
+The MIMETYPE_FILE_READ_SIZE setting was added to limit the number of bytes that
+will be read into memory to determine the MIME type of a new document. For
+compatibility with the current behavior this setting defaults to 0 which means
+that it is disabled. Disabling the setting will cause the entire document's
+file to be loaded into memory. If documents are not processing due to out of
+memory errors (large documents or devices with limited memory), set
+MIMETYPE_FILE_READ_SIZE to a value other than 0. Limited tests suggest 1024
+to be a good alternative as most "magic numbers" used for MIME type detection
+are located at the start of the file and just reading the first 1024 bytes will
+result in a positive identification with little memory usage.
+
+
 Other changes
 ^^^^^^^^^^^^^

--- a/mayan/apps/mimetype/api.py
+++ b/mayan/apps/mimetype/api.py
@@ -2,24 +2,33 @@ from __future__ import unicode_literals

 import magic

+from .settings import setting_file_read_size
+

 def get_mimetype(file_object, mimetype_only=False):
    """
    Determine a file's mimetype by calling the system's libmagic
-    library via python-magic or fallback to use python's mimetypes
-    library
+    library via python-magic.
    """
    file_mimetype = None
    file_mime_encoding = None

+    read_size = setting_file_read_size.value
+    if read_size == 0:
+        # If the setting value is 0 that means disable read limit. To disable
+        # the read limit passing None won't work, we pass -1 instead as per
+        # the Python documentation.
+        # https://docs.python.org/2/tutorial/inputoutput.html#methods-of-file-objects
+        read_size = -1
+
    mime = magic.Magic(mime=True)
-    file_mimetype = mime.from_buffer(file_object.read())
+    file_mimetype = mime.from_buffer(file_object.read(read_size))
    file_object.seek(0)

    if not mimetype_only:
        file_object.seek(0)
        mime_encoding = magic.Magic(mime_encoding=True)
-        file_mime_encoding = mime_encoding.from_buffer(file_object.read())
+        file_mime_encoding = mime_encoding.from_buffer(file_object.read(read_size))
        file_object.seek(0)

    return file_mimetype, file_mime_encoding
--- a/mayan/apps/mimetype/apps.py
+++ b/mayan/apps/mimetype/apps.py
@@ -9,6 +9,7 @@ from .licenses import *  # NOQA

 class MIMETypesApp(MayanAppConfig):
    name = 'mimetype'
+    has_tests = True
    verbose_name = _('MIME types')

    def ready(self, *args, **kwargs):
--- a/mayan/apps/mimetype/settings.py
+++ b/mayan/apps/mimetype/settings.py
@@ -0,0 +1,15 @@
+from __future__ import unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from smart_settings import Namespace
+
+namespace = Namespace(label=_('MIME type'), name='mimetype')
+
+setting_file_read_size = namespace.add_setting(
+    default=0, global_name='MIMETYPE_FILE_READ_SIZE', help_text=_(
+        'Amount of bytes to read from a document to determine its MIME type. '
+        'Setting it to 0 disables the feature and attempts to read the entire '
+        'document file into memory.'
+    )
+)
--- a/mayan/apps/mimetype/tests/init.py
+++ b/mayan/apps/mimetype/tests/init.py
--- a/mayan/apps/mimetype/tests/test_functions.py
+++ b/mayan/apps/mimetype/tests/test_functions.py
@@ -0,0 +1,38 @@
+from __future__ import unicode_literals
+
+import resource
+
+from django.test import override_settings
+
+from common.tests import BaseTestCase
+from documents.models import Document
+from documents.tests import DocumentTestMixin, TEST_DOCUMENT_FILENAME
+
+# This constant may need tweaking as document upload code path changes.
+# The value is targeted at making the document upload process fail exactly
+# during the MIME type detection phase. Different architectures may need
+# different values.
+MAXIMUM_HEAP_MEMORY = 140000000
+
+
+@override_settings(OCR_AUTO_OCR=False)
+@override_settings(DOCUMENT_PARSING_AUTO_PARSING=False)
+class MIMETypeTestCase(DocumentTestMixin, BaseTestCase):
+    auto_upload_document = False
+    test_document_filename = TEST_DOCUMENT_FILENAME
+
+    def setUp(self):
+        super(MIMETypeTestCase, self).setUp()
+        resource.setrlimit(resource.RLIMIT_DATA, (MAXIMUM_HEAP_MEMORY, -1))
+
+    def test_little_memory_full_file(self):
+        with self.assertRaises(Exception):
+            self.upload_document()
+
+        self.assertEqual(Document.objects.count(), 0)
+
+    @override_settings(MIMETYPE_FILE_READ_SIZE=1024)
+    def test_little_memory_partial_file(self):
+        self.upload_document()
+
+        self.assertEqual(Document.objects.count(), 1)