diff --git a/HISTORY.rst b/HISTORY.rst
index a8a024b7ff..9400d6e27d 100644
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -146,6 +146,7 @@
- Improve rendering of the details form.
- Update rendering of the readonly multiselect widget to conform to Django's updated field class interface.
- Add warning when using SQLite as the database backend.
+- Use Mailgun's flanker library to process the email sources.
2.7.3 (2017-09-11)
==================
diff --git a/docs/releases/3.0.rst b/docs/releases/3.0.rst
index e6d1a0986c..dfead2a67b 100644
--- a/docs/releases/3.0.rst
+++ b/docs/releases/3.0.rst
@@ -349,6 +349,15 @@ SQLite. The results are duplicated documents, frequency database locked errors,
among other issues. Suggested database backends are PostgreSQL and MySQL
(or MariaDB) using a transaction aware storage engine like InnoDB.
+Received email processing
+-------------------------
+Parsing email messages is a complex task. To increase compatibility with the
+many interpretations of the standards that govern email messaging, Mayan EDMS
+now uses Mailgun's flanker library (https://github.com/mailgun/flanker).
+Thanks to flanker, Mayan EDMS now gains new capabilities when it comes to
+parsing incoming email. For example, in addition to mail attachments, it is now
+possible to process files included in emails as inline content.
+
Other changes worth mentioning
------------------------------
- Add Makefile target to check the format of the README.rst file.
diff --git a/mayan/apps/sources/classes.py b/mayan/apps/sources/classes.py
index 4b9ccc352c..3fcf32e4c2 100644
--- a/mayan/apps/sources/classes.py
+++ b/mayan/apps/sources/classes.py
@@ -29,14 +29,6 @@ class SourceUploadedFile(File):
self.extra_data = extra_data
-class Attachment(File):
- def __init__(self, part, name):
- self.name = name
- self.file = PseudoFile(
- BytesIO(part.get_payload(decode=True)), name=name
- )
-
-
@python_2_unicode_compatible
class StagingFile(object):
"""
diff --git a/mayan/apps/sources/models.py b/mayan/apps/sources/models.py
index 21b6ecc710..3b029bfd4c 100644
--- a/mayan/apps/sources/models.py
+++ b/mayan/apps/sources/models.py
@@ -9,6 +9,7 @@ import os
import poplib
import subprocess
+from flanker import mime
import sh
import yaml
@@ -21,7 +22,9 @@ from django.core.exceptions import ValidationError
from django.core.files import File
from django.core.files.base import ContentFile
from django.db import models, transaction
-from django.utils.encoding import force_text, python_2_unicode_compatible
+from django.utils.encoding import (
+ force_str, force_text, python_2_unicode_compatible
+)
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
@@ -37,7 +40,7 @@ from documents.settings import setting_language
from metadata.api import set_bulk_metadata
from metadata.models import MetadataType
-from .classes import Attachment, PseudoFile, SourceUploadedFile, StagingFile
+from .classes import PseudoFile, SourceUploadedFile, StagingFile
from .exceptions import SourceException
from .literals import (
DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX,
@@ -573,11 +576,7 @@ class EmailBaseModel(IntervalBaseModel):
return ''.join(header_sections)
@staticmethod
- def process_message(source, message):
- counter = 1
- email = message_from_string(message)
- metadata_dictionary = {}
-
+ def process_message(source, message_text):
if source.subject_metadata_type:
metadata_dictionary[
source.subject_metadata_type.name
@@ -588,29 +587,22 @@ class EmailBaseModel(IntervalBaseModel):
source.from_metadata_type.name
] = EmailBaseModel.getheader(email['From'])
- for part in email.walk():
- disposition = part.get('Content-Disposition', 'none')
- logger.debug('Disposition: %s', disposition)
+ counter = 1
+ metadata_dictionary = {}
- if disposition.startswith('attachment'):
- raw_filename = part.get_filename()
+ message = mime.from_string(force_str(message_text))
- if raw_filename:
- filename = collapse_rfc2231_value(raw_filename)
-
- # Decode base64 encoded filename
- # https://stackoverflow.com/a/21859258/1364435
- if decode_header(filename)[0][1] is not None:
- filename = str(decode_header(filename)[0][0]).decode(decode_header(filename)[0][1])
-
- else:
- filename = _('attachment-%i') % counter
- counter += 1
-
- logger.debug('filename: %s', filename)
-
- with Attachment(part, name=filename) as file_object:
- if filename == source.metadata_attachment_name:
+ # Messages are tree based, do nested processing of message parts until
+        # a message with no children is found, then work our way up.
+ if message.parts:
+ for part in message.parts:
+ EmailBaseModel.process_message(source=source, message_text=part.to_string())
+ else:
+ # Treat inlines as attachments, both are extracted and saved as
+ # documents
+ if message.is_attachment() or message.is_inline():
+ with ContentFile(content=message.body, name=message.detected_file_name) as file_object:
+ if message.detected_file_name == source.metadata_attachment_name:
metadata_dictionary = yaml.safe_load(
file_object.read()
)
@@ -620,7 +612,7 @@ class EmailBaseModel(IntervalBaseModel):
else:
document = source.handle_upload(
document_type=source.document_type,
- file_object=file_object, label=filename,
+ file_object=file_object, label=message.detected_file_name,
expand=(
source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y
)
@@ -631,25 +623,24 @@ class EmailBaseModel(IntervalBaseModel):
metadata_dictionary=metadata_dictionary
)
else:
- logger.debug('No Content-Disposition')
+ # If it is not an attachment then it should be a body message part.
+ # Another option is to use message.is_body()
+ if message.detected_content_type == 'text/html':
+ label = 'email_body.html'
+ else:
+ label = 'email_body.txt'
- content_type = part.get_content_type()
-
- logger.debug('content_type: %s', content_type)
-
- if content_type == 'text/plain' and source.store_body:
- content = part.get_payload(decode=True).decode(part.get_content_charset())
- with ContentFile(content=content, name='email_body.txt') as file_object:
- document = source.handle_upload(
- document_type=source.document_type,
- file_object=file_object,
- expand=SOURCE_UNCOMPRESS_CHOICE_N, label='email_body.txt',
+ with ContentFile(content=message.body, name=label) as file_object:
+ document = source.handle_upload(
+ document_type=source.document_type,
+ file_object=file_object,
+ expand=SOURCE_UNCOMPRESS_CHOICE_N
+ )
+ if metadata_dictionary:
+ set_bulk_metadata(
+ document=document,
+ metadata_dictionary=metadata_dictionary
)
- if metadata_dictionary:
- set_bulk_metadata(
- document=document,
- metadata_dictionary=metadata_dictionary
- )
class POP3Email(EmailBaseModel):
@@ -692,7 +683,7 @@ class POP3Email(EmailBaseModel):
complete_message = '\n'.join(mailbox.retr(message_number)[1])
EmailBaseModel.process_message(
- source=self, message=complete_message
+ source=self, message_text=complete_message
)
mailbox.dele(message_number)
@@ -737,7 +728,7 @@ class IMAPEmail(EmailBaseModel):
logger.debug('message_number: %s', message_number)
status, data = mailbox.fetch(message_number, '(RFC822)')
EmailBaseModel.process_message(
- source=self, message=data[0][1]
+ source=self, message_text=data[0][1]
)
mailbox.store(message_number, '+FLAGS', '\\Deleted')
diff --git a/mayan/apps/sources/tests/literals.py b/mayan/apps/sources/tests/literals.py
index 9bc4521ebe..37acf777de 100644
--- a/mayan/apps/sources/tests/literals.py
+++ b/mayan/apps/sources/tests/literals.py
@@ -1,5 +1,113 @@
from __future__ import unicode_literals
+TEST_EMAIL_ATTACHMENT_AND_INLINE='''Subject: Test 03: inline and attachments
+To: Renat Gilmanov
+Content-Type: multipart/mixed; boundary=001a11c24d809f1525051712cc78
+
+--001a11c24d809f1525051712cc78
+Content-Type: multipart/related; boundary=001a11c24d809f1523051712cc77
+
+--001a11c24d809f1523051712cc77
+Content-Type: text/html; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+
Lorem ipsum dolor sit amet, consectetur adipiscing elit. P=
+ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu=
+s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn=
+a id pulvinar.

=E2=80=8B
Cras eu velit ac purus feugiat impe=
+rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve=
+lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante=
+ luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate=
+, et semper tellus condimentum.
Best regards
+
+--001a11c24d809f1523051712cc77
+Content-Type: image/png; name="test-01.png"
+Content-Disposition: inline; filename="test-01.png"
+Content-Transfer-Encoding: base64
+Content-ID:
+X-Attachment-Id: ii_ia6yyemg0_14d9636d8ac7a587
+
+iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz
+AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB
+...
+QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+
+4gU280YtuwAAAABJRU5ErkJggg==
+--001a11c24d809f1523051712cc77--
+--001a11c24d809f1525051712cc78
+Content-Type: image/png; name="test-02.png"
+Content-Disposition: attachment; filename="test-02.png"
+Content-Transfer-Encoding: base64
+X-Attachment-Id: f_ia6yymei1'''
+TEST_EMAIL_BASE64_FILENAME = '''From: noreply@example.com
+To: test@example.com
+Subject: Scan to E-mail Server Job
+Date: Tue, 23 May 2017 23:03:37 +0200
+Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76>
+Mime-Version: 1.0
+Content-Type: multipart/mixed;
+ boundary="RS1tYWlsIENsaWVudA=="
+X-Mailer: E-mail Client
+
+This is multipart message.
+
+--RS1tYWlsIENsaWVudA==
+Content-Type: text/plain; charset=iso-8859-1
+Content-Transfer-Encoding: quoted-printable
+
+Sending device cannot receive e-mail replies.
+--RS1tYWlsIENsaWVudA==
+Content-Type: text/plain
+Content-Transfer-Encoding: base64
+Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?="
+
+SGFsbG8gQW1wZWxtw6RubmNoZW4hCg==
+
+--RS1tYWlsIENsaWVudA==--'''
+
+TEST_EMAIL_NO_CONTENT_TYPE = '''MIME-Version: 1.0
+Received: by 10.0.0.1 with HTTP; Mon, 9 Apr 2018 00:00:00 -0400 (AST)
+X-Originating-IP: [10.0.0.1]
+Date: Mon, 9 Apr 2018 0:00:0 -0400
+Delivered-To: test-sender@example.com
+Message-ID:
+Subject: Test message with no content type
+From: Test Sender
+To: test-receiver@example.com
+
+Test email without a content type'''
+TEST_EMAIL_NO_CONTENT_TYPE_STRING = 'Test email without a content type'
+TEST_EMAIL_INLINE_IMAGE = '''Subject: Test 01: inline only
+To: Renat Gilmanov
+Content-Type: multipart/related; boundary=089e0149bb0ea4e55c051712afb5
+
+--089e0149bb0ea4e55c051712afb5
+Content-Type: text/html; charset=UTF-8
+Content-Transfer-Encoding: quoted-printable
+
+Lorem ipsum dolor sit amet, consectetur adipiscing elit. P=
+ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu=
+s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn=
+a id pulvinar.

=E2=80=8B
Cras eu velit ac purus feugiat impe=
+rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve=
+lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante=
+ luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate=
+, et semper tellus condimentum.
Best regards
+
+--089e0149bb0ea4e55c051712afb5
+Content-Type: image/png; name="test-01.png"
+Content-Disposition: inline; filename="test-01.png"
+Content-Transfer-Encoding: base64
+Content-ID:
+X-Attachment-Id: ii_ia6yo3z92_14d962f8450cc6f1
+
+iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz
+AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB
+...
+QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+
+4gU280YtuwAAAABJRU5ErkJggg==
+--089e0149bb0ea4e55c051712afb5--'''
TEST_SOURCE_LABEL = 'test source'
TEST_SOURCE_UNCOMPRESS_N = 'n'
TEST_STAGING_PREVIEW_WIDTH = 640
diff --git a/mayan/apps/sources/tests/test_models.py b/mayan/apps/sources/tests/test_models.py
index 8aa7d70b98..d322e64e01 100644
--- a/mayan/apps/sources/tests/test_models.py
+++ b/mayan/apps/sources/tests/test_models.py
@@ -16,6 +16,12 @@ from documents.tests import (
from ..literals import SOURCE_UNCOMPRESS_CHOICE_Y
from ..models import WatchFolderSource, WebFormSource, EmailBaseModel
+from .literals import (
+ TEST_EMAIL_ATTACHMENT_AND_INLINE, TEST_EMAIL_BASE64_FILENAME,
+ TEST_EMAIL_INLINE_IMAGE, TEST_EMAIL_NO_CONTENT_TYPE,
+ TEST_EMAIL_NO_CONTENT_TYPE_STRING
+)
+
@override_settings(OCR_AUTO_OCR=False)
class UploadDocumentTestCase(BaseTestCase):
@@ -119,53 +125,68 @@ class CompressedUploadsTestCase(BaseTestCase):
)
-test_email = """From: noreply@example.com
-To: test@example.com
-Subject: Scan to E-mail Server Job
-Date: Tue, 23 May 2017 23:03:37 +0200
-Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76>
-Mime-Version: 1.0
-Content-Type: multipart/mixed;
- boundary="RS1tYWlsIENsaWVudA=="
-X-Mailer: E-mail Client
-
-This is multipart message.
-
---RS1tYWlsIENsaWVudA==
-Content-Type: text/plain; charset=iso-8859-1
-Content-Transfer-Encoding: quoted-printable
-
-Sending device cannot receive e-mail replies.
---RS1tYWlsIENsaWVudA==
-Content-Type: text/plain
-Content-Transfer-Encoding: base64
-Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?="
-
-SGFsbG8gQW1wZWxtw6RubmNoZW4hCg==
-
---RS1tYWlsIENsaWVudA==--"""
-
-
-class SourceStub():
- subject_metadata_type = None
- from_metadata_type = None
- metadata_attachment_name = None
- document_type = None
- uncompress = None
- store_body = False
- label = ""
-
- def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None,
- metadata_dict_list=None, metadata_dictionary=None, tag_ids=None, user=None):
- self.label = label
-
-
+@override_settings(OCR_AUTO_OCR=False)
class EmailFilenameDecodingTestCase(BaseTestCase):
- """
- Test decoding of base64 encoded e-mail attachment filename.
- """
+ def setUp(self):
+ super(EmailFilenameDecodingTestCase, self).setUp()
+ self.document_type = DocumentType.objects.create(
+ label=TEST_DOCUMENT_TYPE_LABEL
+ )
- def test_decode_email_encoded_filename(self):
- source_stub = SourceStub()
- EmailBaseModel.process_message(source_stub, test_email)
- self.assertEqual(source_stub.label, u'Ampelm\xe4nnchen.txt')
+ def tearDown(self):
+ self.document_type.delete()
+ super(EmailFilenameDecodingTestCase, self).tearDown()
+
+ def _create_email_source(self):
+ self.source = EmailBaseModel(
+ document_type=self.document_type,
+ host='', username='', password='', store_body=True
+ )
+
+
+ def test_decode_email_base64_encoded_filename(self):
+ """
+ Test decoding of base64 encoded e-mail attachment filename.
+ """
+ self._create_email_source()
+ EmailBaseModel.process_message(
+ source=self.source, message_text=TEST_EMAIL_BASE64_FILENAME
+ )
+
+ self.assertEqual(
+ Document.objects.first().label, 'Ampelm\xe4nnchen.txt'
+ )
+
+    def test_decode_email_no_content_type(self):
+        self._create_email_source()
+        EmailBaseModel.process_message(
+            source=self.source, message_text=TEST_EMAIL_NO_CONTENT_TYPE
+        )
+        self.assertIn(  # assertIn reports both operands when the check fails
+            TEST_EMAIL_NO_CONTENT_TYPE_STRING, Document.objects.first().open().read()
+        )
+
+    def test_decode_email_with_inline_image(self):
+        self._create_email_source()
+        EmailBaseModel.process_message(
+            source=self.source, message_text=TEST_EMAIL_INLINE_IMAGE
+        )
+        self.assertEqual(Document.objects.count(), 2)  # HTML body + inline image
+        self.assertQuerysetEqual(
+            ordered=False, qs=Document.objects.all(), values=(
+                '', ''
+            ),
+        )
+
+    def test_decode_email_with_attachment_and_inline_image(self):
+        self._create_email_source()
+        EmailBaseModel.process_message(
+            source=self.source, message_text=TEST_EMAIL_ATTACHMENT_AND_INLINE
+        )
+        self.assertEqual(Document.objects.count(), 3)  # one per expected entry below
+        self.assertQuerysetEqual(
+            ordered=False, qs=Document.objects.all(), values=(
+                '', '',
+                ''
+            ),
+        )
diff --git a/requirements/base.txt b/requirements/base.txt
index dda95e2e90..432c454ce3 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -24,6 +24,7 @@ djangorestframework==3.7.7
djangorestframework-recursive==0.1.2
drf-yasg==1.5.0
+flanker==0.8.5
flex==6.12.0
furl==1.0.1
fusepy==2.0.4
diff --git a/setup.py b/setup.py
index 567cc0201f..c2a9553ade 100644
--- a/setup.py
+++ b/setup.py
@@ -61,40 +61,41 @@ Pillow==5.0.0
PyYAML==3.12
celery==3.1.24
cssmin==0.2.0
-django-activity-stream==0.6.3
+django-activity-stream==0.6.5
django-autoadmin==1.1.1
django-celery==3.2.1
django-colorful==1.2
-django-compressor==2.1
-django-cors-headers==1.2.2
+django-compressor==2.2
+django-cors-headers==2.2.0
django-downloadview==1.9
-django-formtools==2.0
+django-formtools==2.1
django-pure-pagination==0.3.0
django-mathfilters==0.4.0
-django-model-utils==2.6.1
-django-mptt>=0.8.7
-django-qsstats-magic==0.7.2
-django-rest-swagger==0.3.10
-django-stronghold==0.2.8
-django-suit==0.2.25
+django-model-utils==3.1.1
+django-mptt==0.9.0
+django-qsstats-magic==1.0.0
+django-stronghold==0.3.0
+django-suit==0.2.26
django-widget-tweaks==1.4.1
djangorestframework==3.7.7
djangorestframework-recursive==0.1.2
drf-yasg==1.5.0
+flanker==0.8.5
flex==6.12.0
furl==1.0.1
fusepy==2.0.4
-graphviz==0.8
+graphviz==0.8.2
mock==2.0.0
pycountry==1.20
PyPDF2==1.26.0
-pyocr==0.4.5
-python-dateutil==2.5.3
+pyocr==0.5.1
+python-dateutil==2.6.1
python-gnupg==0.3.9
-python-magic==0.4.13
-pytz==2016.7
+python-magic==0.4.15
+pytz==2018.3
requests==2.18.4
-sh==1.12.11
+sh==1.12.14
+swagger-spec-validator==2.1.0
""".split()
with open('README.rst') as f: