diff --git a/HISTORY.rst b/HISTORY.rst index a8a024b7ff..9400d6e27d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -146,6 +146,7 @@ - Improve rendering of the details form. - Update rendering of the readonly multiselect widget to conform to Django's updated field class interface. - Add warning when using SQLite as the database backend. +- Use Mailgun's flanker library to process the email sources. 2.7.3 (2017-09-11) ================== diff --git a/docs/releases/3.0.rst b/docs/releases/3.0.rst index e6d1a0986c..dfead2a67b 100644 --- a/docs/releases/3.0.rst +++ b/docs/releases/3.0.rst @@ -349,6 +349,15 @@ SQLite. The results are duplicated documents, frequency database locked errors, among other issues. Suggested database backends are PostgreSQL and MySQL (or MariaDB) using a transaction aware storage engine like InnoDB. +Received email processing +------------------------- +Parsing email messages is a complex task. To increase compatibility with the +many interpretations of the standards that govern email messaging, Mayan EDMS +now uses Mailgun's flanker library (https://github.com/mailgun/flanker). +Thanks to flanker, Mayan EDMS now gains new capabilities when it comes to +parsing incoming email. For example, in addition to mail attachments, it is now +possible to process files included in emails as inline content. + Other changes worth mentioning ------------------------------ - Add Makefile target to check the format of the README.rst file. 
diff --git a/mayan/apps/sources/classes.py b/mayan/apps/sources/classes.py index 4b9ccc352c..3fcf32e4c2 100644 --- a/mayan/apps/sources/classes.py +++ b/mayan/apps/sources/classes.py @@ -29,14 +29,6 @@ class SourceUploadedFile(File): self.extra_data = extra_data -class Attachment(File): - def __init__(self, part, name): - self.name = name - self.file = PseudoFile( - BytesIO(part.get_payload(decode=True)), name=name - ) - - @python_2_unicode_compatible class StagingFile(object): """ diff --git a/mayan/apps/sources/models.py b/mayan/apps/sources/models.py index 21b6ecc710..3b029bfd4c 100644 --- a/mayan/apps/sources/models.py +++ b/mayan/apps/sources/models.py @@ -9,6 +9,7 @@ import os import poplib import subprocess +from flanker import mime import sh import yaml @@ -21,7 +22,9 @@ from django.core.exceptions import ValidationError from django.core.files import File from django.core.files.base import ContentFile from django.db import models, transaction -from django.utils.encoding import force_text, python_2_unicode_compatible +from django.utils.encoding import ( + force_str, force_text, python_2_unicode_compatible +) from django.utils.timezone import now from django.utils.translation import ugettext_lazy as _ @@ -37,7 +40,7 @@ from documents.settings import setting_language from metadata.api import set_bulk_metadata from metadata.models import MetadataType -from .classes import Attachment, PseudoFile, SourceUploadedFile, StagingFile +from .classes import PseudoFile, SourceUploadedFile, StagingFile from .exceptions import SourceException from .literals import ( DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX, @@ -573,11 +576,7 @@ class EmailBaseModel(IntervalBaseModel): return ''.join(header_sections) @staticmethod - def process_message(source, message): - counter = 1 - email = message_from_string(message) - metadata_dictionary = {} - + def process_message(source, message_text): if source.subject_metadata_type: metadata_dictionary[ 
source.subject_metadata_type.name @@ -588,29 +587,22 @@ class EmailBaseModel(IntervalBaseModel): source.from_metadata_type.name ] = EmailBaseModel.getheader(email['From']) - for part in email.walk(): - disposition = part.get('Content-Disposition', 'none') - logger.debug('Disposition: %s', disposition) + counter = 1 + metadata_dictionary = {} - if disposition.startswith('attachment'): - raw_filename = part.get_filename() + message = mime.from_string(force_str(message_text)) - if raw_filename: - filename = collapse_rfc2231_value(raw_filename) - - # Decode base64 encoded filename - # https://stackoverflow.com/a/21859258/1364435 - if decode_header(filename)[0][1] is not None: - filename = str(decode_header(filename)[0][0]).decode(decode_header(filename)[0][1]) - - else: - filename = _('attachment-%i') % counter - counter += 1 - - logger.debug('filename: %s', filename) - - with Attachment(part, name=filename) as file_object: - if filename == source.metadata_attachment_name: + # Messages are tree based, do nested processing of message parts until + # a message with no children is found, then work our way up. 
+ if message.parts: + for part in message.parts: + EmailBaseModel.process_message(source=source, message_text=part.to_string()) + else: + # Treat inlines as attachments, both are extracted and saved as + # documents + if message.is_attachment() or message.is_inline(): + with ContentFile(content=message.body, name=message.detected_file_name) as file_object: + if message.detected_file_name == source.metadata_attachment_name: metadata_dictionary = yaml.safe_load( file_object.read() ) @@ -620,7 +612,7 @@ class EmailBaseModel(IntervalBaseModel): else: document = source.handle_upload( document_type=source.document_type, - file_object=file_object, label=filename, + file_object=file_object, label=message.detected_file_name, expand=( source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y ) @@ -631,25 +623,24 @@ class EmailBaseModel(IntervalBaseModel): metadata_dictionary=metadata_dictionary ) else: - logger.debug('No Content-Disposition') + # If it is not an attachment then it should be a body message part. 
+ # Another option is to use message.is_body() + if message.detected_content_type == 'text/html': + label = 'email_body.html' + else: + label = 'email_body.txt' - content_type = part.get_content_type() - - logger.debug('content_type: %s', content_type) - - if content_type == 'text/plain' and source.store_body: - content = part.get_payload(decode=True).decode(part.get_content_charset()) - with ContentFile(content=content, name='email_body.txt') as file_object: - document = source.handle_upload( - document_type=source.document_type, - file_object=file_object, - expand=SOURCE_UNCOMPRESS_CHOICE_N, label='email_body.txt', + with ContentFile(content=message.body, name=label) as file_object: + document = source.handle_upload( + document_type=source.document_type, + file_object=file_object, + expand=SOURCE_UNCOMPRESS_CHOICE_N + ) + if metadata_dictionary: + set_bulk_metadata( + document=document, + metadata_dictionary=metadata_dictionary ) - if metadata_dictionary: - set_bulk_metadata( - document=document, - metadata_dictionary=metadata_dictionary - ) class POP3Email(EmailBaseModel): @@ -692,7 +683,7 @@ class POP3Email(EmailBaseModel): complete_message = '\n'.join(mailbox.retr(message_number)[1]) EmailBaseModel.process_message( - source=self, message=complete_message + source=self, message_text=complete_message ) mailbox.dele(message_number) @@ -737,7 +728,7 @@ class IMAPEmail(EmailBaseModel): logger.debug('message_number: %s', message_number) status, data = mailbox.fetch(message_number, '(RFC822)') EmailBaseModel.process_message( - source=self, message=data[0][1] + source=self, message_text=data[0][1] ) mailbox.store(message_number, '+FLAGS', '\\Deleted') diff --git a/mayan/apps/sources/tests/literals.py b/mayan/apps/sources/tests/literals.py index 9bc4521ebe..37acf777de 100644 --- a/mayan/apps/sources/tests/literals.py +++ b/mayan/apps/sources/tests/literals.py @@ -1,5 +1,113 @@ from __future__ import unicode_literals +TEST_EMAIL_ATTACHMENT_AND_INLINE='''Subject: Test 
03: inline and attachments +To: Renat Gilmanov +Content-Type: multipart/mixed; boundary=001a11c24d809f1525051712cc78 + +--001a11c24d809f1525051712cc78 +Content-Type: multipart/related; boundary=001a11c24d809f1523051712cc77 + +--001a11c24d809f1523051712cc77 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable + +
Lorem ipsum dolor sit amet, consectetur adipiscing elit. P= +ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu= +s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn= +a id pulvinar.


=E2=80=8B
Cras eu velit ac purus feugiat impe= +rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve= +lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante= + luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate= +, et semper tellus condimentum.

Best regards
+ +--001a11c24d809f1523051712cc77 +Content-Type: image/png; name="test-01.png" +Content-Disposition: inline; filename="test-01.png" +Content-Transfer-Encoding: base64 +Content-ID: +X-Attachment-Id: ii_ia6yyemg0_14d9636d8ac7a587 + +iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz +AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB +... +QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+ +4gU280YtuwAAAABJRU5ErkJggg== +--001a11c24d809f1523051712cc77-- +--001a11c24d809f1525051712cc78 +Content-Type: image/png; name="test-02.png" +Content-Disposition: attachment; filename="test-02.png" +Content-Transfer-Encoding: base64 +X-Attachment-Id: f_ia6yymei1''' +TEST_EMAIL_BASE64_FILENAME = '''From: noreply@example.com +To: test@example.com +Subject: Scan to E-mail Server Job +Date: Tue, 23 May 2017 23:03:37 +0200 +Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76> +Mime-Version: 1.0 +Content-Type: multipart/mixed; + boundary="RS1tYWlsIENsaWVudA==" +X-Mailer: E-mail Client + +This is multipart message. + +--RS1tYWlsIENsaWVudA== +Content-Type: text/plain; charset=iso-8859-1 +Content-Transfer-Encoding: quoted-printable + +Sending device cannot receive e-mail replies. 
+--RS1tYWlsIENsaWVudA== +Content-Type: text/plain +Content-Transfer-Encoding: base64 +Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?=" + +SGFsbG8gQW1wZWxtw6RubmNoZW4hCg== + +--RS1tYWlsIENsaWVudA==--''' + +TEST_EMAIL_NO_CONTENT_TYPE = '''MIME-Version: 1.0 +Received: by 10.0.0.1 with HTTP; Mon, 9 Apr 2018 00:00:00 -0400 (AST) +X-Originating-IP: [10.0.0.1] +Date: Mon, 9 Apr 2018 0:00:0 -0400 +Delivered-To: test-sender@example.com +Message-ID: +Subject: Test message with no content type +From: Test Sender +To: test-receiver@example.com + +Test email without a content type''' +TEST_EMAIL_NO_CONTENT_TYPE_STRING = 'Test email without a content type' +TEST_EMAIL_INLINE_IMAGE = '''Subject: Test 01: inline only +To: Renat Gilmanov +Content-Type: multipart/related; boundary=089e0149bb0ea4e55c051712afb5 + +--089e0149bb0ea4e55c051712afb5 +Content-Type: text/html; charset=UTF-8 +Content-Transfer-Encoding: quoted-printable + +
Lorem ipsum dolor sit amet, consectetur adipiscing elit. P= +ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu= +s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn= +a id pulvinar.


=E2=80=8B
Cras eu velit ac purus feugiat impe= +rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve= +lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante= + luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate= +, et semper tellus condimentum.

Best regards
+ +--089e0149bb0ea4e55c051712afb5 +Content-Type: image/png; name="test-01.png" +Content-Disposition: inline; filename="test-01.png" +Content-Transfer-Encoding: base64 +Content-ID: +X-Attachment-Id: ii_ia6yo3z92_14d962f8450cc6f1 + +iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz +AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB +... +QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+ +4gU280YtuwAAAABJRU5ErkJggg== +--089e0149bb0ea4e55c051712afb5--''' TEST_SOURCE_LABEL = 'test source' TEST_SOURCE_UNCOMPRESS_N = 'n' TEST_STAGING_PREVIEW_WIDTH = 640 diff --git a/mayan/apps/sources/tests/test_models.py b/mayan/apps/sources/tests/test_models.py index 8aa7d70b98..d322e64e01 100644 --- a/mayan/apps/sources/tests/test_models.py +++ b/mayan/apps/sources/tests/test_models.py @@ -16,6 +16,12 @@ from documents.tests import ( from ..literals import SOURCE_UNCOMPRESS_CHOICE_Y from ..models import WatchFolderSource, WebFormSource, EmailBaseModel +from .literals import ( + TEST_EMAIL_ATTACHMENT_AND_INLINE, TEST_EMAIL_BASE64_FILENAME, + TEST_EMAIL_INLINE_IMAGE, TEST_EMAIL_NO_CONTENT_TYPE, + TEST_EMAIL_NO_CONTENT_TYPE_STRING +) + @override_settings(OCR_AUTO_OCR=False) class UploadDocumentTestCase(BaseTestCase): @@ -119,53 +125,68 @@ class CompressedUploadsTestCase(BaseTestCase): ) -test_email = """From: noreply@example.com -To: test@example.com -Subject: Scan to E-mail Server Job -Date: Tue, 23 May 2017 23:03:37 +0200 -Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76> -Mime-Version: 1.0 -Content-Type: multipart/mixed; - boundary="RS1tYWlsIENsaWVudA==" -X-Mailer: E-mail Client - -This is multipart message. - ---RS1tYWlsIENsaWVudA== -Content-Type: text/plain; charset=iso-8859-1 -Content-Transfer-Encoding: quoted-printable - -Sending device cannot receive e-mail replies. 
---RS1tYWlsIENsaWVudA== -Content-Type: text/plain -Content-Transfer-Encoding: base64 -Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?=" - -SGFsbG8gQW1wZWxtw6RubmNoZW4hCg== - ---RS1tYWlsIENsaWVudA==--""" - - -class SourceStub(): - subject_metadata_type = None - from_metadata_type = None - metadata_attachment_name = None - document_type = None - uncompress = None - store_body = False - label = "" - - def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None, - metadata_dict_list=None, metadata_dictionary=None, tag_ids=None, user=None): - self.label = label - - +@override_settings(OCR_AUTO_OCR=False) class EmailFilenameDecodingTestCase(BaseTestCase): - """ - Test decoding of base64 encoded e-mail attachment filename. - """ + def setUp(self): + super(EmailFilenameDecodingTestCase, self).setUp() + self.document_type = DocumentType.objects.create( + label=TEST_DOCUMENT_TYPE_LABEL + ) - def test_decode_email_encoded_filename(self): - source_stub = SourceStub() - EmailBaseModel.process_message(source_stub, test_email) - self.assertEqual(source_stub.label, u'Ampelm\xe4nnchen.txt') + def tearDown(self): + self.document_type.delete() + super(EmailFilenameDecodingTestCase, self).tearDown() + + def _create_email_source(self): + self.source = EmailBaseModel( + document_type=self.document_type, + host='', username='', password='', store_body=True + ) + + + def test_decode_email_base64_encoded_filename(self): + """ + Test decoding of base64 encoded e-mail attachment filename. 
+ """ + self._create_email_source() + EmailBaseModel.process_message( + source=self.source, message_text=TEST_EMAIL_BASE64_FILENAME + ) + + self.assertEqual( + Document.objects.first().label, 'Ampelm\xe4nnchen.txt' + ) + + def test_decode_email_no_content_type(self): + self._create_email_source() + EmailBaseModel.process_message( + source=self.source, message_text=TEST_EMAIL_NO_CONTENT_TYPE + ) + self.assertTrue( + TEST_EMAIL_NO_CONTENT_TYPE_STRING in Document.objects.first().open().read() + ) + + def test_decode_email_with_inline_image(self): + self._create_email_source() + EmailBaseModel.process_message( + source=self.source, message_text=TEST_EMAIL_INLINE_IMAGE + ) + self.assertTrue(Document.objects.count(), 2) + self.assertQuerysetEqual( + ordered=False, qs=Document.objects.all(), values=( + '', '' + ), + ) + + def test_decode_email_with_attachment_and_inline_image(self): + self._create_email_source() + EmailBaseModel.process_message( + source=self.source, message_text=TEST_EMAIL_ATTACHMENT_AND_INLINE + ) + self.assertTrue(Document.objects.count(), 2) + self.assertQuerysetEqual( + ordered=False, qs=Document.objects.all(), values=( + '', '', + '' + ), + ) diff --git a/requirements/base.txt b/requirements/base.txt index dda95e2e90..432c454ce3 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -24,6 +24,7 @@ djangorestframework==3.7.7 djangorestframework-recursive==0.1.2 drf-yasg==1.5.0 +flanker==0.8.5 flex==6.12.0 furl==1.0.1 fusepy==2.0.4 diff --git a/setup.py b/setup.py index 567cc0201f..c2a9553ade 100644 --- a/setup.py +++ b/setup.py @@ -61,40 +61,41 @@ Pillow==5.0.0 PyYAML==3.12 celery==3.1.24 cssmin==0.2.0 -django-activity-stream==0.6.3 +django-activity-stream==0.6.5 django-autoadmin==1.1.1 django-celery==3.2.1 django-colorful==1.2 -django-compressor==2.1 -django-cors-headers==1.2.2 +django-compressor==2.2 +django-cors-headers==2.2.0 django-downloadview==1.9 -django-formtools==2.0 +django-formtools==2.1 django-pure-pagination==0.3.0 
django-mathfilters==0.4.0 -django-model-utils==2.6.1 -django-mptt>=0.8.7 -django-qsstats-magic==0.7.2 -django-rest-swagger==0.3.10 -django-stronghold==0.2.8 -django-suit==0.2.25 +django-model-utils==3.1.1 +django-mptt==0.9.0 +django-qsstats-magic==1.0.0 +django-stronghold==0.3.0 +django-suit==0.2.26 django-widget-tweaks==1.4.1 djangorestframework==3.7.7 djangorestframework-recursive==0.1.2 drf-yasg==1.5.0 +flanker==0.8.5 flex==6.12.0 furl==1.0.1 fusepy==2.0.4 -graphviz==0.8 +graphviz==0.8.2 mock==2.0.0 pycountry==1.20 PyPDF2==1.26.0 -pyocr==0.4.5 -python-dateutil==2.5.3 +pyocr==0.5.1 +python-dateutil==2.6.1 python-gnupg==0.3.9 -python-magic==0.4.13 -pytz==2016.7 +python-magic==0.4.15 +pytz==2018.3 requests==2.18.4 -sh==1.12.11 +sh==1.12.14 +swagger-spec-validator==2.1.0 """.split() with open('README.rst') as f: