Use Mailgun's flanker library to process the email sources.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2018-04-10 01:16:53 -04:00
parent 3909481205
commit 63a77d0235
8 changed files with 244 additions and 120 deletions

View File

@@ -146,6 +146,7 @@
- Improve rendering of the details form.
- Update rendering of the readonly multiselect widget to conform to Django's updated field class interface.
- Add warning when using SQLite as the database backend.
- Use Mailgun's flanker library to process the email sources.
2.7.3 (2017-09-11)
==================

View File

@@ -349,6 +349,15 @@ SQLite. The results are duplicated documents, frequency database locked errors,
among other issues. Suggested database backends are PostgreSQL and MySQL
(or MariaDB) using a transaction aware storage engine like InnoDB.
Received email processing
-------------------------
Parsing email messages is a complex task. To increase compatibility with the
many interpretations of the standards that govern email messaging, Mayan EDMS
now uses Mailgun's flanker library (https://github.com/mailgun/flanker).
Thanks to flanker, Mayan EDMS gains new capabilities for parsing incoming
email. For example, in addition to mail attachments, it is now possible to
process files included in emails as inline content.
Other changes worth mentioning
------------------------------
- Add Makefile target to check the format of the README.rst file.

View File

@@ -29,14 +29,6 @@ class SourceUploadedFile(File):
self.extra_data = extra_data
class Attachment(File):
def __init__(self, part, name):
self.name = name
self.file = PseudoFile(
BytesIO(part.get_payload(decode=True)), name=name
)
@python_2_unicode_compatible
class StagingFile(object):
"""

View File

@@ -9,6 +9,7 @@ import os
import poplib
import subprocess
from flanker import mime
import sh
import yaml
@@ -21,7 +22,9 @@ from django.core.exceptions import ValidationError
from django.core.files import File
from django.core.files.base import ContentFile
from django.db import models, transaction
from django.utils.encoding import force_text, python_2_unicode_compatible
from django.utils.encoding import (
force_str, force_text, python_2_unicode_compatible
)
from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _
@@ -37,7 +40,7 @@ from documents.settings import setting_language
from metadata.api import set_bulk_metadata
from metadata.models import MetadataType
from .classes import Attachment, PseudoFile, SourceUploadedFile, StagingFile
from .classes import PseudoFile, SourceUploadedFile, StagingFile
from .exceptions import SourceException
from .literals import (
DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX,
@@ -573,11 +576,7 @@ class EmailBaseModel(IntervalBaseModel):
return ''.join(header_sections)
@staticmethod
def process_message(source, message):
counter = 1
email = message_from_string(message)
metadata_dictionary = {}
def process_message(source, message_text):
if source.subject_metadata_type:
metadata_dictionary[
source.subject_metadata_type.name
@@ -588,29 +587,22 @@ class EmailBaseModel(IntervalBaseModel):
source.from_metadata_type.name
] = EmailBaseModel.getheader(email['From'])
for part in email.walk():
disposition = part.get('Content-Disposition', 'none')
logger.debug('Disposition: %s', disposition)
counter = 1
metadata_dictionary = {}
if disposition.startswith('attachment'):
raw_filename = part.get_filename()
message = mime.from_string(force_str(message_text))
if raw_filename:
filename = collapse_rfc2231_value(raw_filename)
# Decode base64 encoded filename
# https://stackoverflow.com/a/21859258/1364435
if decode_header(filename)[0][1] is not None:
filename = str(decode_header(filename)[0][0]).decode(decode_header(filename)[0][1])
else:
filename = _('attachment-%i') % counter
counter += 1
logger.debug('filename: %s', filename)
with Attachment(part, name=filename) as file_object:
if filename == source.metadata_attachment_name:
# Messages are tree based, do nested processing of message parts until
# a message with no children is found, then work our way up.
if message.parts:
for part in message.parts:
EmailBaseModel.process_message(source=source, message_text=part.to_string())
else:
# Treat inlines as attachments, both are extracted and saved as
# documents
if message.is_attachment() or message.is_inline():
with ContentFile(content=message.body, name=message.detected_file_name) as file_object:
if message.detected_file_name == source.metadata_attachment_name:
metadata_dictionary = yaml.safe_load(
file_object.read()
)
@@ -620,7 +612,7 @@ class EmailBaseModel(IntervalBaseModel):
else:
document = source.handle_upload(
document_type=source.document_type,
file_object=file_object, label=filename,
file_object=file_object, label=message.detected_file_name,
expand=(
source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y
)
@@ -631,25 +623,24 @@ class EmailBaseModel(IntervalBaseModel):
metadata_dictionary=metadata_dictionary
)
else:
logger.debug('No Content-Disposition')
# If it is not an attachment then it should be a body message part.
# Another option is to use message.is_body()
if message.detected_content_type == 'text/html':
label = 'email_body.html'
else:
label = 'email_body.txt'
content_type = part.get_content_type()
logger.debug('content_type: %s', content_type)
if content_type == 'text/plain' and source.store_body:
content = part.get_payload(decode=True).decode(part.get_content_charset())
with ContentFile(content=content, name='email_body.txt') as file_object:
document = source.handle_upload(
document_type=source.document_type,
file_object=file_object,
expand=SOURCE_UNCOMPRESS_CHOICE_N, label='email_body.txt',
with ContentFile(content=message.body, name=label) as file_object:
document = source.handle_upload(
document_type=source.document_type,
file_object=file_object,
expand=SOURCE_UNCOMPRESS_CHOICE_N
)
if metadata_dictionary:
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
if metadata_dictionary:
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
class POP3Email(EmailBaseModel):
@@ -692,7 +683,7 @@ class POP3Email(EmailBaseModel):
complete_message = '\n'.join(mailbox.retr(message_number)[1])
EmailBaseModel.process_message(
source=self, message=complete_message
source=self, message_text=complete_message
)
mailbox.dele(message_number)
@@ -737,7 +728,7 @@ class IMAPEmail(EmailBaseModel):
logger.debug('message_number: %s', message_number)
status, data = mailbox.fetch(message_number, '(RFC822)')
EmailBaseModel.process_message(
source=self, message=data[0][1]
source=self, message_text=data[0][1]
)
mailbox.store(message_number, '+FLAGS', '\\Deleted')

View File

@@ -1,5 +1,113 @@
from __future__ import unicode_literals
TEST_EMAIL_ATTACHMENT_AND_INLINE='''Subject: Test 03: inline and attachments
To: Renat Gilmanov
Content-Type: multipart/mixed; boundary=001a11c24d809f1525051712cc78
--001a11c24d809f1525051712cc78
Content-Type: multipart/related; boundary=001a11c24d809f1523051712cc77
--001a11c24d809f1523051712cc77
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">Lorem ipsum dolor sit amet, consectetur adipiscing elit. P=
ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu=
s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn=
a id pulvinar.<br><br><img src=3D"cid:ii_ia6yyemg0_14d9636d8ac7a587" height=
=3D"218" width=3D"320"><br>=E2=80=8B<br>Cras eu velit ac purus feugiat impe=
rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve=
lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante=
luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate=
, et semper tellus condimentum.<br><br>Best regards</div>
--001a11c24d809f1523051712cc77
Content-Type: image/png; name="test-01.png"
Content-Disposition: inline; filename="test-01.png"
Content-Transfer-Encoding: base64
Content-ID: <ii_ia6yyemg0_14d9636d8ac7a587>
X-Attachment-Id: ii_ia6yyemg0_14d9636d8ac7a587
iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz
AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB
...
QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+
4gU280YtuwAAAABJRU5ErkJggg==
--001a11c24d809f1523051712cc77--
--001a11c24d809f1525051712cc78
Content-Type: image/png; name="test-02.png"
Content-Disposition: attachment; filename="test-02.png"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_ia6yymei1'''
TEST_EMAIL_BASE64_FILENAME = '''From: noreply@example.com
To: test@example.com
Subject: Scan to E-mail Server Job
Date: Tue, 23 May 2017 23:03:37 +0200
Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76>
Mime-Version: 1.0
Content-Type: multipart/mixed;
boundary="RS1tYWlsIENsaWVudA=="
X-Mailer: E-mail Client
This is multipart message.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain; charset=iso-8859-1
Content-Transfer-Encoding: quoted-printable
Sending device cannot receive e-mail replies.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?="
SGFsbG8gQW1wZWxtw6RubmNoZW4hCg==
--RS1tYWlsIENsaWVudA==--'''
TEST_EMAIL_NO_CONTENT_TYPE = '''MIME-Version: 1.0
Received: by 10.0.0.1 with HTTP; Mon, 9 Apr 2018 00:00:00 -0400 (AST)
X-Originating-IP: [10.0.0.1]
Date: Mon, 9 Apr 2018 0:00:0 -0400
Delivered-To: test-sender@example.com
Message-ID: <CAEAsyCbSF1Bk7CBuu6zp3Qs8=j2iUkNi3dPkGe6z40q4dmaogQ@mail.gmail.com>
Subject: Test message with no content type
From: Test Sender <test-sender@example.com>
To: test-receiver@example.com
Test email without a content type'''
TEST_EMAIL_NO_CONTENT_TYPE_STRING = 'Test email without a content type'
TEST_EMAIL_INLINE_IMAGE = '''Subject: Test 01: inline only
To: Renat Gilmanov
Content-Type: multipart/related; boundary=089e0149bb0ea4e55c051712afb5
--089e0149bb0ea4e55c051712afb5
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">Lorem ipsum dolor sit amet, consectetur adipiscing elit. P=
ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu=
s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn=
a id pulvinar.<br><br><img src=3D"cid:ii_ia6yo3z92_14d962f8450cc6f1" height=
=3D"218" width=3D"320"><br>=E2=80=8B<br>Cras eu velit ac purus feugiat impe=
rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve=
lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante=
luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate=
, et semper tellus condimentum.<br><br>Best regards<br></div>
--089e0149bb0ea4e55c051712afb5
Content-Type: image/png; name="test-01.png"
Content-Disposition: inline; filename="test-01.png"
Content-Transfer-Encoding: base64
Content-ID: <ii_ia6yo3z92_14d962f8450cc6f1>
X-Attachment-Id: ii_ia6yo3z92_14d962f8450cc6f1
iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz
AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB
...
QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+
4gU280YtuwAAAABJRU5ErkJggg==
--089e0149bb0ea4e55c051712afb5--'''
TEST_SOURCE_LABEL = 'test source'
TEST_SOURCE_UNCOMPRESS_N = 'n'
TEST_STAGING_PREVIEW_WIDTH = 640

View File

@@ -16,6 +16,12 @@ from documents.tests import (
from ..literals import SOURCE_UNCOMPRESS_CHOICE_Y
from ..models import WatchFolderSource, WebFormSource, EmailBaseModel
from .literals import (
TEST_EMAIL_ATTACHMENT_AND_INLINE, TEST_EMAIL_BASE64_FILENAME,
TEST_EMAIL_INLINE_IMAGE, TEST_EMAIL_NO_CONTENT_TYPE,
TEST_EMAIL_NO_CONTENT_TYPE_STRING
)
@override_settings(OCR_AUTO_OCR=False)
class UploadDocumentTestCase(BaseTestCase):
@@ -119,53 +125,68 @@ class CompressedUploadsTestCase(BaseTestCase):
)
test_email = """From: noreply@example.com
To: test@example.com
Subject: Scan to E-mail Server Job
Date: Tue, 23 May 2017 23:03:37 +0200
Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76>
Mime-Version: 1.0
Content-Type: multipart/mixed;
boundary="RS1tYWlsIENsaWVudA=="
X-Mailer: E-mail Client
This is multipart message.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain; charset=iso-8859-1
Content-Transfer-Encoding: quoted-printable
Sending device cannot receive e-mail replies.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?="
SGFsbG8gQW1wZWxtw6RubmNoZW4hCg==
--RS1tYWlsIENsaWVudA==--"""
class SourceStub():
subject_metadata_type = None
from_metadata_type = None
metadata_attachment_name = None
document_type = None
uncompress = None
store_body = False
label = ""
def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None,
metadata_dict_list=None, metadata_dictionary=None, tag_ids=None, user=None):
self.label = label
@override_settings(OCR_AUTO_OCR=False)
class EmailFilenameDecodingTestCase(BaseTestCase):
"""
Test decoding of base64 encoded e-mail attachment filename.
"""
def setUp(self):
super(EmailFilenameDecodingTestCase, self).setUp()
self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
def test_decode_email_encoded_filename(self):
source_stub = SourceStub()
EmailBaseModel.process_message(source_stub, test_email)
self.assertEqual(source_stub.label, u'Ampelm\xe4nnchen.txt')
def tearDown(self):
self.document_type.delete()
super(EmailFilenameDecodingTestCase, self).tearDown()
def _create_email_source(self):
self.source = EmailBaseModel(
document_type=self.document_type,
host='', username='', password='', store_body=True
)
def test_decode_email_base64_encoded_filename(self):
"""
Test decoding of base64 encoded e-mail attachment filename.
"""
self._create_email_source()
EmailBaseModel.process_message(
source=self.source, message_text=TEST_EMAIL_BASE64_FILENAME
)
# The fixture's attachment filename is the encoded word
# "=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?=", which decodes to the label
# asserted below.
self.assertEqual(
Document.objects.first().label, 'Ampelm\xe4nnchen.txt'
)
def test_decode_email_no_content_type(self):
    """
    Process a message that has no Content-Type header and verify that
    its text ends up in the content of the first stored document.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        source=self.source, message_text=TEST_EMAIL_NO_CONTENT_TYPE
    )
    # assertIn reports both operands on failure, unlike
    # assertTrue(a in b) which only reports "False is not true".
    self.assertIn(
        TEST_EMAIL_NO_CONTENT_TYPE_STRING,
        Document.objects.first().open().read()
    )
def test_decode_email_with_inline_image(self):
    """
    Process a message made of an HTML body and an inline image and
    verify that each one is stored as its own document.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        source=self.source, message_text=TEST_EMAIL_INLINE_IMAGE
    )
    # assertTrue(count, 2) used 2 as the failure message and passed for
    # any non-zero count; compare against the expected total instead.
    self.assertEqual(Document.objects.count(), 2)
    self.assertQuerysetEqual(
        ordered=False, qs=Document.objects.all(), values=(
            '<Document: test-01.png>', '<Document: email_body.html>'
        ),
    )
def test_decode_email_with_attachment_and_inline_image(self):
    """
    Process a message made of an HTML body, an inline image and a
    regular attachment and verify that each one is stored as its own
    document.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        source=self.source, message_text=TEST_EMAIL_ATTACHMENT_AND_INLINE
    )
    # assertTrue(count, 2) used 2 as the failure message and passed for
    # any non-zero count. Three documents are expected here, matching
    # the three entries of the queryset comparison below.
    self.assertEqual(Document.objects.count(), 3)
    self.assertQuerysetEqual(
        ordered=False, qs=Document.objects.all(), values=(
            '<Document: test-01.png>', '<Document: email_body.html>',
            '<Document: test-02.png>'
        ),
    )

View File

@@ -24,6 +24,7 @@ djangorestframework==3.7.7
djangorestframework-recursive==0.1.2
drf-yasg==1.5.0
flanker==0.8.5
flex==6.12.0
furl==1.0.1
fusepy==2.0.4

View File

@@ -61,40 +61,41 @@ Pillow==5.0.0
PyYAML==3.12
celery==3.1.24
cssmin==0.2.0
django-activity-stream==0.6.3
django-activity-stream==0.6.5
django-autoadmin==1.1.1
django-celery==3.2.1
django-colorful==1.2
django-compressor==2.1
django-cors-headers==1.2.2
django-compressor==2.2
django-cors-headers==2.2.0
django-downloadview==1.9
django-formtools==2.0
django-formtools==2.1
django-pure-pagination==0.3.0
django-mathfilters==0.4.0
django-model-utils==2.6.1
django-mptt>=0.8.7
django-qsstats-magic==0.7.2
django-rest-swagger==0.3.10
django-stronghold==0.2.8
django-suit==0.2.25
django-model-utils==3.1.1
django-mptt==0.9.0
django-qsstats-magic==1.0.0
django-stronghold==0.3.0
django-suit==0.2.26
django-widget-tweaks==1.4.1
djangorestframework==3.7.7
djangorestframework-recursive==0.1.2
drf-yasg==1.5.0
flanker==0.8.5
flex==6.12.0
furl==1.0.1
fusepy==2.0.4
graphviz==0.8
graphviz==0.8.2
mock==2.0.0
pycountry==1.20
PyPDF2==1.26.0
pyocr==0.4.5
python-dateutil==2.5.3
pyocr==0.5.1
python-dateutil==2.6.1
python-gnupg==0.3.9
python-magic==0.4.13
pytz==2016.7
python-magic==0.4.15
pytz==2018.3
requests==2.18.4
sh==1.12.11
sh==1.12.14
swagger-spec-validator==2.1.0
""".split()
with open('README.rst') as f: