Add support for including a metadata dictionary as an attachment for the POP3 and IMAP sources.

This commit is contained in:
Roberto Rosario
2015-08-13 03:05:55 -04:00
parent 5bb8f779b3
commit 28c45bf988
5 changed files with 82 additions and 17 deletions

View File

@@ -119,3 +119,18 @@ def convert_dict_to_dict_list(dictionary):
result.append({'id': metadata_type.pk, 'value': value}) result.append({'id': metadata_type.pk, 'value': value})
return result return result
def set_bulk_metadata(document, metadata_dictionary):
    """
    Assign metadata values to a document from a name -> value mapping.

    Each key of `metadata_dictionary` is looked up by name among the
    metadata types attached to the document's document type. Keys that do
    not match one of those metadata types are skipped, whether the name is
    unknown altogether or simply not assigned to this document type. A bad
    entry therefore never aborts the caller's document upload.

    Arguments:
        document: the document instance receiving the metadata; its
            `document_type.metadata` relation is read.
        metadata_dictionary: mapping of metadata type name to value. An
            empty or None mapping is a no-op.
    """
    if not metadata_dictionary:
        return

    # Build the name lookup once instead of querying MetadataType for every
    # entry of the dictionary.
    # NOTE(review): assumes metadata type names are unique, as the previous
    # `MetadataType.objects.get(name=...)` lookup did -- confirm on the model.
    allowed_metadata_types = {
        document_type_metadata_type.metadata_type.name:
            document_type_metadata_type.metadata_type
        for document_type_metadata_type
        in document.document_type.metadata.all()
    }

    for metadata_type_name, value in metadata_dictionary.items():
        metadata_type = allowed_metadata_types.get(metadata_type_name)

        if metadata_type is None:
            # Not a metadata type of this document type: ignore the entry.
            continue

        # NOTE(review): `value` is part of the lookup, so an existing row
        # with a different value is not updated; a new row is attempted
        # instead. Kept as in the original -- confirm this is intended.
        DocumentMetadata.objects.get_or_create(
            document=document, metadata_type=metadata_type, value=value
        )

View File

@@ -94,7 +94,8 @@ class POP3EmailSetupForm(EmailSetupBaseForm):
class Meta(EmailSetupBaseForm.Meta): class Meta(EmailSetupBaseForm.Meta):
fields = ( fields = (
'label', 'enabled', 'interval', 'document_type', 'uncompress', 'label', 'enabled', 'interval', 'document_type', 'uncompress',
'host', 'ssl', 'port', 'username', 'password', 'timeout' 'host', 'ssl', 'port', 'username', 'password', 'timeout',
'metadata_attachment_name',
) )
model = POP3Email model = POP3Email
@@ -103,7 +104,8 @@ class IMAPEmailSetupForm(EmailSetupBaseForm):
class Meta(EmailSetupBaseForm.Meta): class Meta(EmailSetupBaseForm.Meta):
fields = ( fields = (
'label', 'enabled', 'interval', 'document_type', 'uncompress', 'label', 'enabled', 'interval', 'document_type', 'uncompress',
'host', 'ssl', 'port', 'username', 'password', 'mailbox' 'host', 'ssl', 'port', 'username', 'password', 'mailbox',
'metadata_attachment_name'
) )
model = IMAPEmail model = IMAPEmail

View File

@@ -32,6 +32,7 @@ SOURCE_CHOICES = (
) )
DEFAULT_INTERVAL = 600 DEFAULT_INTERVAL = 600
DEFAULT_METADATA_ATTACHMENT_NAME = 'metadata.yaml'
DEFAULT_POP3_TIMEOUT = 60 DEFAULT_POP3_TIMEOUT = 60
DEFAULT_IMAP_MAILBOX = 'INBOX' DEFAULT_IMAP_MAILBOX = 'INBOX'
DEFAULT_SOURCE_TASK_RETRY_DELAY = 10 DEFAULT_SOURCE_TASK_RETRY_DELAY = 10

View File

@@ -0,0 +1,20 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from django.db import models, migrations
class Migration(migrations.Migration):
    """
    Add the `metadata_attachment_name` field to `EmailBaseModel`.

    The field stores the name of the email attachment that carries the
    metadata dictionary and defaults to 'metadata.yaml'.
    """

    dependencies = [
        ('sources', '0006_auto_20150708_0330'),
    ]

    operations = [
        migrations.AddField(
            model_name='emailbasemodel',
            name='metadata_attachment_name',
            field=models.CharField(
                default='metadata.yaml',
                # Kept word for word identical to the help_text declared on
                # the model field, so that the migration state matches the
                # model and `makemigrations` does not report a pending
                # AlterField.
                help_text=(
                    'Name of the attachment that will contains the metadata type names '
                    'and value pairs to be assigned to the rest of the downloaded '
                    'attachments. Note: This attachment has to be the first attachment.'
                ),
                max_length=128,
                verbose_name='Metadata attachment name',
            ),
            preserve_default=True,
        ),
    ]

View File

@@ -8,6 +8,8 @@ import logging
import os import os
import poplib import poplib
import yaml
from django.core.files import File from django.core.files import File
from django.db import models, transaction from django.db import models, transaction
from django.utils.encoding import python_2_unicode_compatible from django.utils.encoding import python_2_unicode_compatible
@@ -21,15 +23,16 @@ from converter.models import Transformation
from djcelery.models import PeriodicTask, IntervalSchedule from djcelery.models import PeriodicTask, IntervalSchedule
from documents.models import Document, DocumentType from documents.models import Document, DocumentType
from documents.settings import setting_language from documents.settings import setting_language
from metadata.api import save_metadata_list from metadata.api import save_metadata_list, set_bulk_metadata
from .classes import Attachment, SourceUploadedFile, StagingFile from .classes import Attachment, SourceUploadedFile, StagingFile
from .literals import ( from .literals import (
DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX, DEFAULT_INTERVAL, DEFAULT_POP3_TIMEOUT, DEFAULT_IMAP_MAILBOX,
SOURCE_CHOICES, SOURCE_CHOICE_STAGING, SOURCE_CHOICE_WATCH, DEFAULT_METADATA_ATTACHMENT_NAME, SOURCE_CHOICES, SOURCE_CHOICE_STAGING,
SOURCE_CHOICE_WEB_FORM, SOURCE_INTERACTIVE_UNCOMPRESS_CHOICES, SOURCE_CHOICE_WATCH, SOURCE_CHOICE_WEB_FORM,
SOURCE_UNCOMPRESS_CHOICES, SOURCE_UNCOMPRESS_CHOICE_Y, SOURCE_INTERACTIVE_UNCOMPRESS_CHOICES, SOURCE_UNCOMPRESS_CHOICES,
SOURCE_CHOICE_EMAIL_IMAP, SOURCE_CHOICE_EMAIL_POP3 SOURCE_UNCOMPRESS_CHOICE_Y, SOURCE_CHOICE_EMAIL_IMAP,
SOURCE_CHOICE_EMAIL_POP3
) )
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -52,7 +55,7 @@ class Source(models.Model):
def fullname(self): def fullname(self):
return ' '.join([self.class_fullname(), '"%s"' % self.label]) return ' '.join([self.class_fullname(), '"%s"' % self.label])
def upload_document(self, file_object, document_type, description=None, label=None, language=None, metadata_dict_list=None, user=None): def upload_document(self, file_object, document_type, description=None, label=None, language=None, metadata_dict_list=None, metadata_dictionary=None, user=None):
try: try:
with transaction.atomic(): with transaction.atomic():
document = Document.objects.create( document = Document.objects.create(
@@ -75,6 +78,12 @@ class Source(models.Model):
metadata_dict_list, document, create=True metadata_dict_list, document, create=True
) )
if metadata_dictionary:
set_bulk_metadata(
document=document,
metadata_dictionary=metadata_dictionary
)
except Exception as exception: except Exception as exception:
logger.critical( logger.critical(
'Unexpected exception while trying to create new document "%s" from source "%s"; %s', 'Unexpected exception while trying to create new document "%s" from source "%s"; %s',
@@ -82,14 +91,15 @@ class Source(models.Model):
) )
raise raise
def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None, metadata_dict_list=None, user=None): def handle_upload(self, file_object, description=None, document_type=None, expand=False, label=None, language=None, metadata_dict_list=None, metadata_dictionary=None, user=None):
if not document_type: if not document_type:
document_type = self.document_type document_type = self.document_type
kwargs = { kwargs = {
'description': description, 'document_type': document_type, 'description': description, 'document_type': document_type,
'label': label, 'language': language, 'label': label, 'language': language,
'metadata_dict_list': metadata_dict_list, 'user': user 'metadata_dict_list': metadata_dict_list,
'metadata_dictionary': metadata_dictionary, 'user': user
} }
if expand: if expand:
@@ -312,14 +322,23 @@ class EmailBaseModel(IntervalBaseModel):
) )
username = models.CharField(max_length=96, verbose_name=_('Username')) username = models.CharField(max_length=96, verbose_name=_('Username'))
password = models.CharField(max_length=96, verbose_name=_('Password')) password = models.CharField(max_length=96, verbose_name=_('Password'))
metadata_attachment_name = models.CharField(
default=DEFAULT_METADATA_ATTACHMENT_NAME,
help_text=_(
'Name of the attachment that will contains the metadata type names '
'and value pairs to be assigned to the rest of the downloaded '
'attachments. Note: This attachment has to be the first attachment.'
), max_length=128, verbose_name=_('Metadata attachment name')
)
# From: http://bookmarks.honewatson.com/2009/08/11/python-gmail-imaplib-search-subject-get-attachments/ # From: http://bookmarks.honewatson.com/2009/08/11/python-gmail-imaplib-search-subject-get-attachments/
# TODO: Add lock to avoid running more than once concurrent same document download # TODO: Add lock to avoid running more than once concurrent same document download
# TODO: Use message ID for lock # TODO: Use message ID for lock
@staticmethod @staticmethod
def process_message(source, message): def process_message(source, message):
email = message_from_string(message)
counter = 1 counter = 1
email = message_from_string(message)
metadata_dictionary = None
for part in email.walk(): for part in email.walk():
disposition = part.get('Content-Disposition', 'none') disposition = part.get('Content-Disposition', 'none')
@@ -336,11 +355,19 @@ class EmailBaseModel(IntervalBaseModel):
logger.debug('filename: %s', filename) logger.debug('filename: %s', filename)
file_object = Attachment(part, name=filename) with Attachment(part, name=filename) as file_object:
if filename == source.metadata_attachment_name:
metadata_dictionary = yaml.safe_load(file_object.read())
logger.debug(
'Got metadata dictionary: %s', metadata_dictionary
)
else:
source.handle_upload( source.handle_upload(
document_type=source.document_type, document_type=source.document_type,
file_object=file_object, label=filename, file_object=file_object, label=filename,
expand=(source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y) expand=(
source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y
), metadata_dictionary=metadata_dictionary
) )
class Meta: class Meta: