Use Mailgun's flanker library to process the email sources.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
This commit is contained in:
Roberto Rosario
2018-04-10 01:16:53 -04:00
parent 3909481205
commit 63a77d0235
8 changed files with 244 additions and 120 deletions

View File

@@ -146,6 +146,7 @@
- Improve rendering of the details form. - Improve rendering of the details form.
- Update rendering of the readonly multiselect widget to conform to Django's updated field class interface. - Update rendering of the readonly multiselect widget to conform to Django's updated field class interface.
- Add warning when using SQLite as the database backend. - Add warning when using SQLite as the database backend.
- Use Mailgun's flanker library to process the email sources.
2.7.3 (2017-09-11) 2.7.3 (2017-09-11)
================== ==================

View File

@@ -349,6 +349,15 @@ SQLite. The results are duplicated documents, frequent database locked errors,
among other issues. Suggested database backends are PostgreSQL and MySQL among other issues. Suggested database backends are PostgreSQL and MySQL
(or MariaDB) using a transaction aware storage engine like InnoDB. (or MariaDB) using a transaction aware storage engine like InnoDB.
Received email processing
-------------------------
Parsing email messages is a complex task. To increase compatibility with the
many interpretations of the standards that govern email messaging, Mayan EDMS
now uses Mailgun's flanker library (https://github.com/mailgun/flanker).
Thanks to flanker, Mayan EDMS now gains new capabilities when it comes to
parsing incoming email. For example, in addition to mail attachments, it is now
possible to process files included in emails as inline content.
Other changes worth mentioning Other changes worth mentioning
------------------------------ ------------------------------
- Add Makefile target to check the format of the README.rst file. - Add Makefile target to check the format of the README.rst file.

View File

@@ -29,14 +29,6 @@ class SourceUploadedFile(File):
self.extra_data = extra_data self.extra_data = extra_data
class Attachment(File):
def __init__(self, part, name):
self.name = name
self.file = PseudoFile(
BytesIO(part.get_payload(decode=True)), name=name
)
@python_2_unicode_compatible @python_2_unicode_compatible
class StagingFile(object): class StagingFile(object):
""" """

View File

@@ -9,6 +9,7 @@ import os
import poplib import poplib
import subprocess import subprocess
from flanker import mime
import sh import sh
import yaml import yaml
@@ -21,7 +22,9 @@ from django.core.exceptions import ValidationError
from django.core.files import File from django.core.files import File
from django.core.files.base import ContentFile from django.core.files.base import ContentFile
from django.db import models, transaction from django.db import models, transaction
from django.utils.encoding import force_text, python_2_unicode_compatible from django.utils.encoding import (
force_str, force_text, python_2_unicode_compatible
)
from django.utils.timezone import now from django.utils.timezone import now
from django.utils.translation import ugettext_lazy as _ from django.utils.translation import ugettext_lazy as _
@@ -37,7 +40,7 @@ from documents.settings import setting_language
from metadata.api import set_bulk_metadata from metadata.api import set_bulk_metadata
from metadata.models import MetadataType from metadata.models import MetadataType
from .classes import Attachment, PseudoFile, SourceUploadedFile, StagingFile from .classes import PseudoFile, SourceUploadedFile, StagingFile
from .exceptions import SourceException from .exceptions import SourceException
from .literals import ( from .literals import (
DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX, DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX,
@@ -573,11 +576,7 @@ class EmailBaseModel(IntervalBaseModel):
return ''.join(header_sections) return ''.join(header_sections)
@staticmethod @staticmethod
def process_message(source, message): def process_message(source, message_text):
counter = 1
email = message_from_string(message)
metadata_dictionary = {}
if source.subject_metadata_type: if source.subject_metadata_type:
metadata_dictionary[ metadata_dictionary[
source.subject_metadata_type.name source.subject_metadata_type.name
@@ -588,29 +587,22 @@ class EmailBaseModel(IntervalBaseModel):
source.from_metadata_type.name source.from_metadata_type.name
] = EmailBaseModel.getheader(email['From']) ] = EmailBaseModel.getheader(email['From'])
for part in email.walk(): counter = 1
disposition = part.get('Content-Disposition', 'none') metadata_dictionary = {}
logger.debug('Disposition: %s', disposition)
if disposition.startswith('attachment'): message = mime.from_string(force_str(message_text))
raw_filename = part.get_filename()
if raw_filename: # Messages are tree based, do nested processing of message parts until
filename = collapse_rfc2231_value(raw_filename) # a message with no children is found, then work our way up.
if message.parts:
# Decode base64 encoded filename for part in message.parts:
# https://stackoverflow.com/a/21859258/1364435 EmailBaseModel.process_message(source=source, message_text=part.to_string())
if decode_header(filename)[0][1] is not None: else:
filename = str(decode_header(filename)[0][0]).decode(decode_header(filename)[0][1]) # Treat inlines as attachments, both are extracted and saved as
# documents
else: if message.is_attachment() or message.is_inline():
filename = _('attachment-%i') % counter with ContentFile(content=message.body, name=message.detected_file_name) as file_object:
counter += 1 if message.detected_file_name == source.metadata_attachment_name:
logger.debug('filename: %s', filename)
with Attachment(part, name=filename) as file_object:
if filename == source.metadata_attachment_name:
metadata_dictionary = yaml.safe_load( metadata_dictionary = yaml.safe_load(
file_object.read() file_object.read()
) )
@@ -620,7 +612,7 @@ class EmailBaseModel(IntervalBaseModel):
else: else:
document = source.handle_upload( document = source.handle_upload(
document_type=source.document_type, document_type=source.document_type,
file_object=file_object, label=filename, file_object=file_object, label=message.detected_file_name,
expand=( expand=(
source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y
) )
@@ -631,25 +623,24 @@ class EmailBaseModel(IntervalBaseModel):
metadata_dictionary=metadata_dictionary metadata_dictionary=metadata_dictionary
) )
else: else:
logger.debug('No Content-Disposition') # If it is not an attachment then it should be a body message part.
# Another option is to use message.is_body()
if message.detected_content_type == 'text/html':
label = 'email_body.html'
else:
label = 'email_body.txt'
content_type = part.get_content_type() with ContentFile(content=message.body, name=label) as file_object:
document = source.handle_upload(
logger.debug('content_type: %s', content_type) document_type=source.document_type,
file_object=file_object,
if content_type == 'text/plain' and source.store_body: expand=SOURCE_UNCOMPRESS_CHOICE_N
content = part.get_payload(decode=True).decode(part.get_content_charset()) )
with ContentFile(content=content, name='email_body.txt') as file_object: if metadata_dictionary:
document = source.handle_upload( set_bulk_metadata(
document_type=source.document_type, document=document,
file_object=file_object, metadata_dictionary=metadata_dictionary
expand=SOURCE_UNCOMPRESS_CHOICE_N, label='email_body.txt',
) )
if metadata_dictionary:
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
class POP3Email(EmailBaseModel): class POP3Email(EmailBaseModel):
@@ -692,7 +683,7 @@ class POP3Email(EmailBaseModel):
complete_message = '\n'.join(mailbox.retr(message_number)[1]) complete_message = '\n'.join(mailbox.retr(message_number)[1])
EmailBaseModel.process_message( EmailBaseModel.process_message(
source=self, message=complete_message source=self, message_text=complete_message
) )
mailbox.dele(message_number) mailbox.dele(message_number)
@@ -737,7 +728,7 @@ class IMAPEmail(EmailBaseModel):
logger.debug('message_number: %s', message_number) logger.debug('message_number: %s', message_number)
status, data = mailbox.fetch(message_number, '(RFC822)') status, data = mailbox.fetch(message_number, '(RFC822)')
EmailBaseModel.process_message( EmailBaseModel.process_message(
source=self, message=data[0][1] source=self, message_text=data[0][1]
) )
mailbox.store(message_number, '+FLAGS', '\\Deleted') mailbox.store(message_number, '+FLAGS', '\\Deleted')

View File

@@ -1,5 +1,113 @@
from __future__ import unicode_literals from __future__ import unicode_literals
TEST_EMAIL_ATTACHMENT_AND_INLINE='''Subject: Test 03: inline and attachments
To: Renat Gilmanov
Content-Type: multipart/mixed; boundary=001a11c24d809f1525051712cc78
--001a11c24d809f1525051712cc78
Content-Type: multipart/related; boundary=001a11c24d809f1523051712cc77
--001a11c24d809f1523051712cc77
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">Lorem ipsum dolor sit amet, consectetur adipiscing elit. P=
ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu=
s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn=
a id pulvinar.<br><br><img src=3D"cid:ii_ia6yyemg0_14d9636d8ac7a587" height=
=3D"218" width=3D"320"><br>=E2=80=8B<br>Cras eu velit ac purus feugiat impe=
rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve=
lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante=
luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate=
, et semper tellus condimentum.<br><br>Best regards</div>
--001a11c24d809f1523051712cc77
Content-Type: image/png; name="test-01.png"
Content-Disposition: inline; filename="test-01.png"
Content-Transfer-Encoding: base64
Content-ID: <ii_ia6yyemg0_14d9636d8ac7a587>
X-Attachment-Id: ii_ia6yyemg0_14d9636d8ac7a587
iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz
AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB
...
QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+
4gU280YtuwAAAABJRU5ErkJggg==
--001a11c24d809f1523051712cc77--
--001a11c24d809f1525051712cc78
Content-Type: image/png; name="test-02.png"
Content-Disposition: attachment; filename="test-02.png"
Content-Transfer-Encoding: base64
X-Attachment-Id: f_ia6yymei1'''
TEST_EMAIL_BASE64_FILENAME = '''From: noreply@example.com
To: test@example.com
Subject: Scan to E-mail Server Job
Date: Tue, 23 May 2017 23:03:37 +0200
Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76>
Mime-Version: 1.0
Content-Type: multipart/mixed;
boundary="RS1tYWlsIENsaWVudA=="
X-Mailer: E-mail Client
This is multipart message.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain; charset=iso-8859-1
Content-Transfer-Encoding: quoted-printable
Sending device cannot receive e-mail replies.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?="
SGFsbG8gQW1wZWxtw6RubmNoZW4hCg==
--RS1tYWlsIENsaWVudA==--'''
TEST_EMAIL_NO_CONTENT_TYPE = '''MIME-Version: 1.0
Received: by 10.0.0.1 with HTTP; Mon, 9 Apr 2018 00:00:00 -0400 (AST)
X-Originating-IP: [10.0.0.1]
Date: Mon, 9 Apr 2018 0:00:0 -0400
Delivered-To: test-sender@example.com
Message-ID: <CAEAsyCbSF1Bk7CBuu6zp3Qs8=j2iUkNi3dPkGe6z40q4dmaogQ@mail.gmail.com>
Subject: Test message with no content type
From: Test Sender <test-sender@example.com>
To: test-receiver@example.com
Test email without a content type'''
TEST_EMAIL_NO_CONTENT_TYPE_STRING = 'Test email without a content type'
TEST_EMAIL_INLINE_IMAGE = '''Subject: Test 01: inline only
To: Renat Gilmanov
Content-Type: multipart/related; boundary=089e0149bb0ea4e55c051712afb5
--089e0149bb0ea4e55c051712afb5
Content-Type: text/html; charset=UTF-8
Content-Transfer-Encoding: quoted-printable
<div dir=3D"ltr">Lorem ipsum dolor sit amet, consectetur adipiscing elit. P=
ellentesque odio urna, bibendum eu ultricies in, dignissim in magna. Vivamu=
s risus justo, viverra sed dapibus eu, laoreet eget erat. Sed pretium a urn=
a id pulvinar.<br><br><img src=3D"cid:ii_ia6yo3z92_14d962f8450cc6f1" height=
=3D"218" width=3D"320"><br>=E2=80=8B<br>Cras eu velit ac purus feugiat impe=
rdiet nec sit amet ipsum. Praesent gravida lobortis justo, nec tristique ve=
lit sagittis finibus. Suspendisse porta ante id diam varius, in cursus ante=
luctus. Aenean a mollis mi. Pellentesque accumsan lacus sed erat vulputate=
, et semper tellus condimentum.<br><br>Best regards<br></div>
--089e0149bb0ea4e55c051712afb5
Content-Type: image/png; name="test-01.png"
Content-Disposition: inline; filename="test-01.png"
Content-Transfer-Encoding: base64
Content-ID: <ii_ia6yo3z92_14d962f8450cc6f1>
X-Attachment-Id: ii_ia6yo3z92_14d962f8450cc6f1
iVBORw0KGgoAAAANSUhEUgAAAUAAAADaCAYAAADXGps7AAAABHNCSVQICAgIfAhkiAAAAAlwSFlz
AAALewAAC3sBSRnwgAAAABl0RVh0U29mdHdhcmUAd3d3Lmlua3NjYXBlLm9yZ5vuPBoAAALnSURB
...
QCDLAIEsAwSyDBDIMkAgywCBLAMEsgwQyDJAIMsAgSwDBLIMEMgyQCDLAIEsAwSyDBDIMkAg6wK+
4gU280YtuwAAAABJRU5ErkJggg==
--089e0149bb0ea4e55c051712afb5--'''
TEST_SOURCE_LABEL = 'test source' TEST_SOURCE_LABEL = 'test source'
TEST_SOURCE_UNCOMPRESS_N = 'n' TEST_SOURCE_UNCOMPRESS_N = 'n'
TEST_STAGING_PREVIEW_WIDTH = 640 TEST_STAGING_PREVIEW_WIDTH = 640

View File

@@ -16,6 +16,12 @@ from documents.tests import (
from ..literals import SOURCE_UNCOMPRESS_CHOICE_Y from ..literals import SOURCE_UNCOMPRESS_CHOICE_Y
from ..models import WatchFolderSource, WebFormSource, EmailBaseModel from ..models import WatchFolderSource, WebFormSource, EmailBaseModel
from .literals import (
TEST_EMAIL_ATTACHMENT_AND_INLINE, TEST_EMAIL_BASE64_FILENAME,
TEST_EMAIL_INLINE_IMAGE, TEST_EMAIL_NO_CONTENT_TYPE,
TEST_EMAIL_NO_CONTENT_TYPE_STRING
)
@override_settings(OCR_AUTO_OCR=False) @override_settings(OCR_AUTO_OCR=False)
class UploadDocumentTestCase(BaseTestCase): class UploadDocumentTestCase(BaseTestCase):
@@ -119,53 +125,68 @@ class CompressedUploadsTestCase(BaseTestCase):
) )
test_email = """From: noreply@example.com @override_settings(OCR_AUTO_OCR=False)
To: test@example.com
Subject: Scan to E-mail Server Job
Date: Tue, 23 May 2017 23:03:37 +0200
Message-Id: <00000001.465619c9.1.00@BRN30055CCF4D76>
Mime-Version: 1.0
Content-Type: multipart/mixed;
boundary="RS1tYWlsIENsaWVudA=="
X-Mailer: E-mail Client
This is multipart message.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain; charset=iso-8859-1
Content-Transfer-Encoding: quoted-printable
Sending device cannot receive e-mail replies.
--RS1tYWlsIENsaWVudA==
Content-Type: text/plain
Content-Transfer-Encoding: base64
Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?="
SGFsbG8gQW1wZWxtw6RubmNoZW4hCg==
--RS1tYWlsIENsaWVudA==--"""
class SourceStub():
subject_metadata_type = None
from_metadata_type = None
metadata_attachment_name = None
document_type = None
uncompress = None
store_body = False
label = ""
def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None,
metadata_dict_list=None, metadata_dictionary=None, tag_ids=None, user=None):
self.label = label
class EmailFilenameDecodingTestCase(BaseTestCase): class EmailFilenameDecodingTestCase(BaseTestCase):
""" def setUp(self):
Test decoding of base64 encoded e-mail attachment filename. super(EmailFilenameDecodingTestCase, self).setUp()
""" self.document_type = DocumentType.objects.create(
label=TEST_DOCUMENT_TYPE_LABEL
)
def test_decode_email_encoded_filename(self): def tearDown(self):
source_stub = SourceStub() self.document_type.delete()
EmailBaseModel.process_message(source_stub, test_email) super(EmailFilenameDecodingTestCase, self).tearDown()
self.assertEqual(source_stub.label, u'Ampelm\xe4nnchen.txt')
def _create_email_source(self):
    """
    Instantiate (without saving) an email source bound to the test
    document type, with body storing enabled, and keep it on the test
    case as ``self.source``.
    """
    source_options = dict(
        document_type=self.document_type, store_body=True,
        host='', username='', password=''
    )
    self.source = EmailBaseModel(**source_options)
def test_decode_email_base64_encoded_filename(self):
    """
    The base64 encoded attachment filename of the test email must be
    decoded before it is used as the label of the created document.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        message_text=TEST_EMAIL_BASE64_FILENAME, source=self.source
    )

    document = Document.objects.first()
    self.assertEqual(document.label, 'Ampelm\xe4nnchen.txt')
def test_decode_email_no_content_type(self):
    """
    A message without a Content-Type header must still be processed and
    its text stored in the content of the created document.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        source=self.source, message_text=TEST_EMAIL_NO_CONTENT_TYPE
    )

    # assertIn reports both operands on failure, unlike
    # assertTrue(a in b) which only reports "False is not true".
    self.assertIn(
        TEST_EMAIL_NO_CONTENT_TYPE_STRING,
        Document.objects.first().open().read()
    )
def test_decode_email_with_inline_image(self):
    """
    An email made of an HTML body and one inline image must produce two
    documents: the inline image and the HTML body.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        source=self.source, message_text=TEST_EMAIL_INLINE_IMAGE
    )

    # assertEqual, not assertTrue: assertTrue(count, 2) uses 2 as the
    # failure message and passes for any non-zero count.
    self.assertEqual(Document.objects.count(), 2)
    self.assertQuerysetEqual(
        ordered=False, qs=Document.objects.all(), values=(
            '<Document: test-01.png>', '<Document: email_body.html>'
        ),
    )
def test_decode_email_with_attachment_and_inline_image(self):
    """
    An email made of an HTML body, one inline image and one regular
    attachment must produce three documents, one for each of them.
    """
    self._create_email_source()
    EmailBaseModel.process_message(
        source=self.source, message_text=TEST_EMAIL_ATTACHMENT_AND_INLINE
    )

    # assertEqual, not assertTrue: assertTrue(count, 2) uses 2 as the
    # failure message and passes for any non-zero count. The expected
    # count is 3 to agree with the three documents listed below.
    self.assertEqual(Document.objects.count(), 3)
    self.assertQuerysetEqual(
        ordered=False, qs=Document.objects.all(), values=(
            '<Document: test-01.png>', '<Document: email_body.html>',
            '<Document: test-02.png>'
        ),
    )

View File

@@ -24,6 +24,7 @@ djangorestframework==3.7.7
djangorestframework-recursive==0.1.2 djangorestframework-recursive==0.1.2
drf-yasg==1.5.0 drf-yasg==1.5.0
flanker==0.8.5
flex==6.12.0 flex==6.12.0
furl==1.0.1 furl==1.0.1
fusepy==2.0.4 fusepy==2.0.4

View File

@@ -61,40 +61,41 @@ Pillow==5.0.0
PyYAML==3.12 PyYAML==3.12
celery==3.1.24 celery==3.1.24
cssmin==0.2.0 cssmin==0.2.0
django-activity-stream==0.6.3 django-activity-stream==0.6.5
django-autoadmin==1.1.1 django-autoadmin==1.1.1
django-celery==3.2.1 django-celery==3.2.1
django-colorful==1.2 django-colorful==1.2
django-compressor==2.1 django-compressor==2.2
django-cors-headers==1.2.2 django-cors-headers==2.2.0
django-downloadview==1.9 django-downloadview==1.9
django-formtools==2.0 django-formtools==2.1
django-pure-pagination==0.3.0 django-pure-pagination==0.3.0
django-mathfilters==0.4.0 django-mathfilters==0.4.0
django-model-utils==2.6.1 django-model-utils==3.1.1
django-mptt>=0.8.7 django-mptt==0.9.0
django-qsstats-magic==0.7.2 django-qsstats-magic==1.0.0
django-rest-swagger==0.3.10 django-stronghold==0.3.0
django-stronghold==0.2.8 django-suit==0.2.26
django-suit==0.2.25
django-widget-tweaks==1.4.1 django-widget-tweaks==1.4.1
djangorestframework==3.7.7 djangorestframework==3.7.7
djangorestframework-recursive==0.1.2 djangorestframework-recursive==0.1.2
drf-yasg==1.5.0 drf-yasg==1.5.0
flanker==0.8.5
flex==6.12.0 flex==6.12.0
furl==1.0.1 furl==1.0.1
fusepy==2.0.4 fusepy==2.0.4
graphviz==0.8 graphviz==0.8.2
mock==2.0.0 mock==2.0.0
pycountry==1.20 pycountry==1.20
PyPDF2==1.26.0 PyPDF2==1.26.0
pyocr==0.4.5 pyocr==0.5.1
python-dateutil==2.5.3 python-dateutil==2.6.1
python-gnupg==0.3.9 python-gnupg==0.3.9
python-magic==0.4.13 python-magic==0.4.15
pytz==2016.7 pytz==2018.3
requests==2.18.4 requests==2.18.4
sh==1.12.11 sh==1.12.14
swagger-spec-validator==2.1.0
""".split() """.split()
with open('README.rst') as f: with open('README.rst') as f: