From 4d11234ba50fc9efb22e8ad37534255aad150627 Mon Sep 17 00:00:00 2001 From: Roberto Rosario Date: Thu, 7 Jun 2018 20:42:49 -0400 Subject: [PATCH] =?UTF-8?q?Fix=20error=20when=20trying=20to=20upload=20a?= =?UTF-8?q?=20document=20from=20and=20email=20account=20with=20'from'=20an?= =?UTF-8?q?d=20'subject'=20metadata.=20Fix=20typo=20on=20message.header=20?= =?UTF-8?q?get=20from=20'Suject'=20to=20'Subject'.=20On=20multi=20part=20e?= =?UTF-8?q?mails=20keep=20the=20original=20From=20and=20Subject=20properti?= =?UTF-8?q?es=20for=20all=20subsequent=20parts=20if=20the=20sub=20parts=20?= =?UTF-8?q?don't=20specify=20them.=20Fixes=20issue=20#481.=20Thanks=20to?= =?UTF-8?q?=20Robert=20Sch=C3=B6ftner=20@robert.schoeftner=20for=20the=20r?= =?UTF-8?q?eport=20and=20debug=20information.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Roberto Rosario --- HISTORY.rst | 4 + docs/releases/3.0.rst | 5 +- mayan/apps/sources/models.py | 63 ++++++++++----- mayan/apps/sources/tests/literals.py | 2 + mayan/apps/sources/tests/test_models.py | 100 +++++++++++++++++++++++- 5 files changed, 154 insertions(+), 20 deletions(-) diff --git a/HISTORY.rst b/HISTORY.rst index fd010e7f98..30d96d0f3d 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -170,6 +170,10 @@ - Change the error log file handle class to RotatingFileHandle to avoid an indefinitely growing log file. - Disable embedded signatute verification during the perform upgrade command. - Replace the DOCUMENTS_LANGUAGE_CHOICES setting option. Replaced with the new DOCUMENTS_LANGUAGE_CODES. +- Fix error when trying to upload a document from an email account with 'from' and 'subject' metadata. +- Fix typo in the message.headers.get() call from 'Subjet' to 'Subject'. +- On multi part emails keep the original From and Subject properties for all subsequent parts if the sub parts don't specify them. Fixes issue #481. Thanks to Robert Schöftner @robert.schoeftner for the report and debug information. 
+ 2.7.3 (2017-09-11) diff --git a/docs/releases/3.0.rst b/docs/releases/3.0.rst index aaf2162ee4..ada943f87d 100644 --- a/docs/releases/3.0.rst +++ b/docs/releases/3.0.rst @@ -516,6 +516,9 @@ Other changes worth mentioning new DOCUMENTS_LANGUAGE_CODES. - Reduce default language code choice from 7800 to the top 100 spoken languages and related (https://en.wikipedia.org/wiki/List_of_languages_by_number_of_native_speakers) +- Fix error when trying to upload a document from an email account with 'from' and 'subject' metadata. +- Fix typo in the message.headers.get() call from 'Subjet' to 'Subject'. +- On multi part emails keep the original From and Subject properties for all subsequent parts if the sub parts don't specify them. Fixes issue #481. Thanks to Robert Schöftner @robert.schoeftner for the report and debug information. Removals -------- @@ -607,7 +610,7 @@ Bugs fixed or issues closed * `GitLab issue #467 `_ mail attachments without content-disposition are lost * `GitLab issue #468 `_ plain text e-mails without charset do not work * `GitLab issue #470 `_ Enable Django variable for HTML encoded emails - +* `GitLab issue #481 `_ IMAP sources with metadata not working in 3.0rc1 * `GitHub issue #264 `_ migrate fails on document_states 0004_workflow_internal_name * `GitHub issue #269 `_ Lack of authentication for document previews diff --git a/mayan/apps/sources/models.py b/mayan/apps/sources/models.py index c6b6ee4de5..8fe8f78078 100644 --- a/mayan/apps/sources/models.py +++ b/mayan/apps/sources/models.py @@ -80,6 +80,7 @@ class Source(models.Model): Handle an upload request from a file object which may be an individual document or a compressed file containing multiple documents. 
""" + documents = [] if not document_type: document_type = self.document_type @@ -94,16 +95,26 @@ class Source(models.Model): compressed_file = CompressedFile(file_object) for compressed_file_child in compressed_file.children(): kwargs.update({'label': force_text(compressed_file_child)}) - self.upload_document( - file_object=File(compressed_file_child), **kwargs + documents.append( + self.upload_document( + file_object=File(compressed_file_child), **kwargs + ) ) compressed_file_child.close() except NotACompressedFile: logging.debug('Exception: NotACompressedFile') - self.upload_document(file_object=file_object, **kwargs) + documents.append( + self.upload_document(file_object=file_object, **kwargs) + ) else: - self.upload_document(file_object=file_object, **kwargs) + documents.append( + self.upload_document(file_object=file_object, **kwargs) + ) + + # Return a list of newly created documents. Used by the email source + # to assign the from and subject metadata values. + return documents def get_upload_file_object(self, form_data): pass @@ -549,28 +560,42 @@ class EmailBaseModel(IntervalBaseModel): ) @staticmethod - def process_message(source, message_text): + def process_message(source, message_text, message_properties=None): from flanker import mime counter = 1 message = mime.from_string(force_str(message_text)) metadata_dictionary = {} + if not message_properties: + message_properties = {} + + message_properties['Subject'] = message_properties.get( + 'Subject', message.headers.get('Subject') + ) + + message_properties['From'] = message_properties.get( + 'From', message.headers.get('From') + ) + if source.subject_metadata_type: metadata_dictionary[ source.subject_metadata_type.name - ] = message.headers.get('Subjet') + ] = message_properties.get('Subject') if source.from_metadata_type: metadata_dictionary[ source.from_metadata_type.name - ] = message.headers.get('From') + ] = message_properties.get('From') # Messages are tree based, do nested processing of message 
parts until # a message with no children is found, then work out way up. if message.parts: for part in message.parts: - EmailBaseModel.process_message(source=source, message_text=part.to_string()) + EmailBaseModel.process_message( + source=source, message_text=part.to_string(), + message_properties=message_properties + ) else: # Treat inlines as attachments, both are extracted and saved as # documents @@ -585,17 +610,18 @@ class EmailBaseModel(IntervalBaseModel): 'Got metadata dictionary: %s', metadata_dictionary ) else: - document = source.handle_upload( + documents = source.handle_upload( document_type=source.document_type, file_object=file_object, expand=( source.uncompress == SOURCE_UNCOMPRESS_CHOICE_Y ) ) if metadata_dictionary: - set_bulk_metadata( - document=document, - metadata_dictionary=metadata_dictionary - ) + for document in documents: + set_bulk_metadata( + document=document, + metadata_dictionary=metadata_dictionary + ) else: # If it is not an attachment then it should be a body message part. 
# Another option is to use message.is_body() @@ -605,16 +631,17 @@ class EmailBaseModel(IntervalBaseModel): label = 'email_body.txt' with ContentFile(content=message.body, name=label) as file_object: - document = source.handle_upload( + documents = source.handle_upload( document_type=source.document_type, file_object=file_object, expand=SOURCE_UNCOMPRESS_CHOICE_N ) if metadata_dictionary: - set_bulk_metadata( - document=document, - metadata_dictionary=metadata_dictionary - ) + for document in documents: + set_bulk_metadata( + document=document, + metadata_dictionary=metadata_dictionary + ) class IMAPEmail(EmailBaseModel): diff --git a/mayan/apps/sources/tests/literals.py b/mayan/apps/sources/tests/literals.py index e831359995..d9f6556d86 100644 --- a/mayan/apps/sources/tests/literals.py +++ b/mayan/apps/sources/tests/literals.py @@ -64,6 +64,8 @@ Content-Disposition: attachment; filename="=?UTF-8?B?QW1wZWxtw6RubmNoZW4udHh0?=" SGFsbG8gQW1wZWxtw6RubmNoZW4hCg== --RS1tYWlsIENsaWVudA==--''' +TEST_EMAIL_BASE64_FILENAME_FROM = 'noreply@example.com' +TEST_EMAIL_BASE64_FILENAME_SUBJECT = 'Scan to E-mail Server Job' TEST_EMAIL_NO_CONTENT_TYPE = '''MIME-Version: 1.0 Received: by 10.0.0.1 with HTTP; Mon, 9 Apr 2018 00:00:00 -0400 (AST) X-Originating-IP: [10.0.0.1] diff --git a/mayan/apps/sources/tests/test_models.py b/mayan/apps/sources/tests/test_models.py index 55dcd95a97..66ed8a8e27 100644 --- a/mayan/apps/sources/tests/test_models.py +++ b/mayan/apps/sources/tests/test_models.py @@ -14,14 +14,16 @@ from documents.tests import ( TEST_NON_ASCII_DOCUMENT_FILENAME, TEST_NON_ASCII_DOCUMENT_PATH, TEST_NON_ASCII_COMPRESSED_DOCUMENT_PATH ) +from metadata.models import MetadataType from ..literals import SOURCE_UNCOMPRESS_CHOICE_Y from ..models import ( - EmailBaseModel, POP3Email, WatchFolderSource, WebFormSource + EmailBaseModel, IMAPEmail, POP3Email, WatchFolderSource, WebFormSource ) from .literals import ( TEST_EMAIL_ATTACHMENT_AND_INLINE, TEST_EMAIL_BASE64_FILENAME, + 
TEST_EMAIL_BASE64_FILENAME_FROM, TEST_EMAIL_BASE64_FILENAME_SUBJECT, TEST_EMAIL_INLINE_IMAGE, TEST_EMAIL_NO_CONTENT_TYPE, TEST_EMAIL_NO_CONTENT_TYPE_STRING ) @@ -129,6 +131,102 @@ class EmailFilenameDecodingTestCase(BaseTestCase): ), ) + def test_decode_email_and_store_from_and_subject_as_metadata(self): + metadata_from = MetadataType.objects.create(name='from') + metadata_subject = MetadataType.objects.create(name='subject') + self.document_type.metadata.create(metadata_type=metadata_from) + self.document_type.metadata.create(metadata_type=metadata_subject) + + self._create_email_source() + self.source.from_metadata_type = metadata_from + self.source.subject_metadata_type = metadata_subject + self.source.save() + + EmailBaseModel.process_message( + source=self.source, message_text=TEST_EMAIL_BASE64_FILENAME + ) + + document = Document.objects.first() + + self.assertEqual( + document.label, 'Ampelm\xe4nnchen.txt' + ) + self.assertEqual( + document.metadata.get(metadata_type=metadata_from).value, + TEST_EMAIL_BASE64_FILENAME_FROM + ) + self.assertEqual( + document.metadata.get(metadata_type=metadata_subject).value, + TEST_EMAIL_BASE64_FILENAME_SUBJECT + ) + + +@override_settings(OCR_AUTO_OCR=False) +class IMAPSourceTestCase(BaseTestCase): + class MockIMAP4_SSL(object): + #def dele(self, which): + # return + + #def getwelcome(self): + # return + + #def list(self, which=None): + # return (None, ['1 test']) + + #def pass_(self, password): + # return + + #def quit(self): + # return + + #def retr(self, which=None): + # return ( + # 1, [TEST_EMAIL_BASE64_FILENAME] + # ) + + def fetch(self, message_set, message_parts): + return 'STATUS', '(1 BODY[{}])'.format(TEST_EMAIL_ATTACHMENT_AND_INLINE) + #status, data = mailbox.fetch(message_number, '(RFC822)') + #EmailBaseModel.process_message( + # source=self, message_text=data[0][1] + #) + #mailbox.store(message_number, '+FLAGS', '\\Deleted') + + def login(self, username, password): + return + + def search(self, charset, 
*criterion): + return (None, ['1']) + + def select(self, mailbox): + return + + def user(self, username): + return + + def setUp(self): + super(IMAPSourceTestCase, self).setUp() + self.document_type = DocumentType.objects.create( + label=TEST_DOCUMENT_TYPE_LABEL + ) + + def tearDown(self): + self.document_type.delete() + super(IMAPSourceTestCase, self).tearDown() + + @mock.patch('imaplib.IMAP4_SSL') + def test_download_document(self, mock_imaplib): + mock_imaplib.return_value = IMAPSourceTestCase.MockIMAP4_SSL() + self.source = IMAPEmail.objects.create( + document_type=self.document_type, label='', host='', password='', + username='' + ) + + self.source.check_source() + self.assertEqual( + Document.objects.first().label, 'Ampelm\xe4nnchen.txt' + ) + @override_settings(OCR_AUTO_OCR=False) class POP3SourceTestCase(BaseTestCase):