Improve email metadata support

The feature can now work on emails with nested parts.
Also the metadata.yaml attachment no longer needs to be the
first attachment.

Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
This commit is contained in:
Roberto Rosario
2019-06-29 02:12:54 -04:00
parent 305f4d1afd
commit 24dcdfd328
4 changed files with 123 additions and 39 deletions

View File

@@ -20,6 +20,9 @@
trash.
* Remove the INSTALLED_APPS setting. Replace it with
the new COMMON_EXTRA_APPS and COMMON_DISABLED_APPS.
* Improve email metadata support. It can now process
emails with nested parts. Also, the metadata.yaml
attachment no longer needs to be the first attachment.
3.2.3 (2019-06-21)
==================

View File

@@ -34,6 +34,9 @@ Changes
trash.
- Remove the INSTALLED_APPS setting. Replace it with
the new COMMON_EXTRA_APPS and COMMON_DISABLED_APPS.
- Improve email metadata support. It can now process
emails with nested parts. Also, the metadata.yaml
attachment no longer needs to be the first attachment.
Removals
--------
@@ -53,7 +56,7 @@ Remove deprecated requirements::
Type in the console::
$ pip install mayan-edms==3.2.3
$ pip install mayan-edms==3.2.4
the requirements will also be updated automatically.

View File

@@ -16,6 +16,7 @@ from django.db import models
from django.utils.encoding import force_bytes
from django.utils.translation import ugettext_lazy as _
from mayan.apps.documents.models import Document
from mayan.apps.metadata.api import set_bulk_metadata
from mayan.apps.metadata.models import MetadataType
@@ -54,8 +55,7 @@ class EmailBaseModel(IntervalBaseModel):
help_text=_(
'Name of the attachment that will contains the metadata type '
'names and value pairs to be assigned to the rest of the '
'downloaded attachments. Note: This attachment has to be the '
'first attachment.'
'downloaded attachments.'
), max_length=128, verbose_name=_('Metadata attachment name')
)
subject_metadata_type = models.ForeignKey(
@@ -85,52 +85,61 @@ class EmailBaseModel(IntervalBaseModel):
verbose_name_plural = _('Email sources')
@staticmethod
def process_message(source, message_text, message_properties=None):
def process_message(source, message_text):
from flanker import mime
counter = 1
message = mime.from_string(force_bytes(message_text))
metadata_dictionary = {}
if not message_properties:
message_properties = {}
message_properties['Subject'] = message_properties.get(
'Subject', message.headers.get('Subject')
)
message_properties['From'] = message_properties.get(
'From', message.headers.get('From')
)
if source.subject_metadata_type:
metadata_dictionary[
source.subject_metadata_type.name
] = message_properties.get('Subject')
message = mime.from_string(force_bytes(message_text))
if source.from_metadata_type:
metadata_dictionary[
source.from_metadata_type.name
] = message_properties.get('From')
] = message.headers.get('From')
if source.subject_metadata_type:
metadata_dictionary[
source.subject_metadata_type.name
] = message.headers.get('Subject')
document_ids, parts_metadata_dictionary = EmailBaseModel._process_message(source=source, message=message)
metadata_dictionary.update(parts_metadata_dictionary)
if metadata_dictionary:
for document in Document.objects.filter(id__in=document_ids):
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
@staticmethod
def _process_message(source, message):
counter = 1
document_ids = []
metadata_dictionary = {}
# Messages are tree based, do nested processing of message parts until
# a message with no children is found, then work our way up.
if message.parts:
for part in message.parts:
EmailBaseModel.process_message(
source=source, message_text=part.to_string(),
message_properties=message_properties
part_document_ids, part_metadata_dictionary = EmailBaseModel._process_message(
source=source, message=part,
)
document_ids.extend(part_document_ids)
metadata_dictionary.update(part_metadata_dictionary)
else:
# Treat inlines as attachments, both are extracted and saved as
# documents
if message.is_attachment() or message.is_inline():
# Reject zero length attachments
if len(message.body) == 0:
return
return document_ids, metadata_dictionary
label = message.detected_file_name or 'attachment-{}'.format(counter)
counter = counter + 1
with ContentFile(content=message.body, name=label) as file_object:
if label == source.metadata_attachment_name:
metadata_dictionary = yaml.load(
@@ -147,12 +156,10 @@ class EmailBaseModel(IntervalBaseModel):
source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y
)
)
if metadata_dictionary:
for document in documents:
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
for document in documents:
document_ids.append(document.pk)
else:
# If it is not an attachment then it should be a body message part.
# Another option is to use message.is_body()
@@ -168,12 +175,11 @@ class EmailBaseModel(IntervalBaseModel):
expand=SOURCE_UNCOMPRESS_CHOICE_N,
file_object=file_object
)
if metadata_dictionary:
for document in documents:
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
for document in documents:
document_ids.append(document.pk)
return document_ids, metadata_dictionary
def clean(self):
if self.subject_metadata_type:

View File

@@ -6,7 +6,13 @@ import shutil
import mock
from pathlib2 import Path
import yaml
try:
from yaml import CSafeDumper as SafeDumper
except ImportError:
from yaml import SafeDumper
from django.core import mail
from django.utils.encoding import force_text
from mayan.apps.documents.models import Document
@@ -190,6 +196,72 @@ class EmailBaseTestCase(GenericDocumentTestCase):
# Only two attachments and a body document
self.assertEqual(2, Document.objects.count())
# Check that the values in a metadata.yaml attachment are assigned to the
# documents created from an email when metadata.yaml is NOT the first
# attachment of the message.
def test_metadata_yaml_attachment(self):
TEST_METADATA_VALUE_1 = 'test value 1'
TEST_METADATA_VALUE_2 = 'test value 2'
# Create two metadata types and associate both with the test document
# type.
test_metadata_type_1 = MetadataType.objects.create(
name='test_metadata_type_1'
)
test_metadata_type_2 = MetadataType.objects.create(
name='test_metadata_type_2'
)
self.test_document_type.metadata.create(
metadata_type=test_metadata_type_1
)
self.test_document_type.metadata.create(
metadata_type=test_metadata_type_2
)
# Serialize the {metadata type name: value} mapping as YAML; this text is
# used below as the content of the metadata.yaml attachment.
test_metadata_yaml = yaml.dump(
Dumper=SafeDumper, data={
test_metadata_type_1.name: TEST_METADATA_VALUE_1,
test_metadata_type_2.name: TEST_METADATA_VALUE_2,
}
)
# Create email with a test attachment first, then the metadata.yaml
# attachment
# The locmem email backend is selected explicitly; the sent message is read
# back from mail.outbox further down.
with mail.get_connection(
backend='django.core.mail.backends.locmem.EmailBackend'
) as connection:
email_message = mail.EmailMultiAlternatives(
body='test email body', connection=connection,
subject='test email subject', to=['test@example.com'],
)
email_message.attach(
filename='test_attachment',
content='test_content',
)
email_message.attach(
filename='metadata.yaml',
content=test_metadata_yaml,
)
email_message.send()
# Create the email source and enable storing of the message body before
# processing the message.
self._create_email_source()
self.source.store_body = True
self.source.save()
EmailBaseModel.process_message(
source=self.source, message_text=mail.outbox[0].message()
)
# NOTE(review): the expected count of 2 presumably corresponds to the body
# and test_attachment, with metadata.yaml itself not being stored as a
# document -- confirm against EmailBaseModel._process_message.
self.assertEqual(Document.objects.count(), 2)
# Every created document must carry both metadata values taken from the
# YAML attachment.
for document in Document.objects.all():
self.assertEqual(
document.metadata.get(metadata_type=test_metadata_type_1).value,
TEST_METADATA_VALUE_1
)
self.assertEqual(
document.metadata.get(metadata_type=test_metadata_type_2).value,
TEST_METADATA_VALUE_2
)
class IMAPSourceTestCase(GenericDocumentTestCase):
auto_upload_document = False