Associate metadata to all types

Previously, metadata types were associated with document types only if the metadata type was newly created. Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
Merge branch 'versions/minor' into feature/document_importer
2019-06-21 17:37:00 -04:00 · 2019-06-21 11:53:09 -04:00 · 2019-06-21 11:53:03 -04:00 · 2019-06-21 00:06:49 -04:00 · 2019-06-20 10:12:53 -04:00 · 2019-06-20 10:05:24 -04:00
15 changed files with 430 additions and 2 deletions
--- a/HISTORY.rst
+++ b/HISTORY.rst
@@ -7,6 +7,8 @@
 ==================
 * Add support for disabling the random primary key
  test mixin.
+* Add a reusable task to upload documents.
+* Add MVP of the importer app.
 * Fix mailing profile log columns mappings.
  GitLab issue #626. Thanks to Jesaja Everling (@jeverling)
  for the report.
--- a/docs/releases/3.2.3.rst
+++ b/docs/releases/3.2.3.rst
@@ -19,7 +19,6 @@ Changes
  GitLab issue #625. Thanks to Jesaja Everling (@jeverling)
  for the report and the research.

-
 Removals
 --------

--- a/mayan/apps/documents/literals.py
+++ b/mayan/apps/documents/literals.py
@@ -32,6 +32,7 @@ DEFAULT_DOCUMENT_TYPE_LABEL = _('Default')
 DOCUMENT_IMAGE_TASK_TIMEOUT = 120
 STUB_EXPIRATION_INTERVAL = 60 * 60 * 24  # 24 hours
 UPDATE_PAGE_COUNT_RETRY_DELAY = 10
+UPLOAD_NEW_DOCUMENT_RETRY_DELAY = 10
 UPLOAD_NEW_VERSION_RETRY_DELAY = 10

 PAGE_RANGE_ALL = 'all'
--- a/mayan/apps/documents/queues.py
+++ b/mayan/apps/documents/queues.py
@@ -82,3 +82,7 @@ queue_uploads.add_task_type(
    dotted_path='mayan.apps.documents.tasks.task_scan_duplicates_for',
    label=_('Scan document duplicates')
 )
+# Add the new document upload task to the uploads queue.
+queue_uploads.add_task_type(
+    dotted_path='mayan.apps.documents.tasks.task_upload_new_document',
+    label=_('Upload new document')
+)
--- a/mayan/apps/documents/tasks.py
+++ b/mayan/apps/documents/tasks.py
@@ -9,7 +9,8 @@ from django.db import OperationalError
 from mayan.celery import app

 from .literals import (
-    UPDATE_PAGE_COUNT_RETRY_DELAY, UPLOAD_NEW_VERSION_RETRY_DELAY
+    UPDATE_PAGE_COUNT_RETRY_DELAY, UPLOAD_NEW_DOCUMENT_RETRY_DELAY,
+    UPLOAD_NEW_VERSION_RETRY_DELAY
 )

 logger = logging.getLogger(__name__)
@@ -127,6 +128,60 @@ def task_update_page_count(self, version_id):
        raise self.retry(exc=exception)


+@app.task(bind=True, default_retry_delay=UPLOAD_NEW_DOCUMENT_RETRY_DELAY, ignore_result=True)
+def task_upload_new_document(self, document_type_id, shared_uploaded_file_id):
+    """
+    Create a new document of the given type from a shared uploaded file.
+
+    :param document_type_id: primary key of the DocumentType to use.
+    :param shared_uploaded_file_id: primary key of the SharedUploadedFile
+        holding the content of the new document.
+
+    Database OperationalError exceptions cause the task to be retried and
+    leave the shared uploaded file in place for the next attempt. Any other
+    error during the document creation is logged and the shared uploaded
+    file is deleted, the same as after a successful upload.
+    """
+    DocumentType = apps.get_model(
+        app_label='documents', model_name='DocumentType'
+    )
+
+    SharedUploadedFile = apps.get_model(
+        app_label='common', model_name='SharedUploadedFile'
+    )
+
+    try:
+        document_type = DocumentType.objects.get(pk=document_type_id)
+        shared_file = SharedUploadedFile.objects.get(
+            pk=shared_uploaded_file_id
+        )
+    except OperationalError as exception:
+        # Log the ID and not the instance: the instance is not bound when
+        # the document type query is the one that failed.
+        logger.warning(
+            'Operational error during attempt to retrieve shared data for '
+            'new document of type: %s; %s. Retrying.', document_type_id,
+            exception
+        )
+        raise self.retry(exc=exception)
+
+    try:
+        with shared_file.open() as file_object:
+            document_type.new_document(file_object=file_object)
+    except OperationalError as exception:
+        logger.warning(
+            'Operational error during attempt to create new document '
+            'of type: %s; %s. Retrying.', document_type, exception
+        )
+        raise self.retry(exc=exception)
+    except Exception as exception:
+        logger.error(
+            'Unexpected error during attempt to create new document '
+            'of type: %s; %s', document_type, exception
+        )
+
+    # Only reached on success or on a non retryable error, the retry branch
+    # above always raises. In both cases the shared file is no longer needed.
+    try:
+        shared_file.delete()
+    except OperationalError as exception:
+        logger.warning(
+            'Operational error during attempt to delete shared '
+            'file: %s; %s.', shared_file, exception
+        )
+
+
@app.task(bind=True, default_retry_delay=UPLOAD_NEW_VERSION_RETRY_DELAY, ignore_result=True)
 def task_upload_new_version(self, document_id, shared_uploaded_file_id, user_id, comment=None):
    SharedUploadedFile = apps.get_model(
--- a/mayan/apps/importer/__init__.py
+++ b/mayan/apps/importer/__init__.py
@@ -0,0 +1,3 @@
+from __future__ import unicode_literals
+
+default_app_config = 'mayan.apps.importer.apps.ImporterApp'
--- a/mayan/apps/importer/apps.py
+++ b/mayan/apps/importer/apps.py
@@ -0,0 +1,17 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from mayan.apps.common.apps import MayanAppConfig
+
+
+class ImporterApp(MayanAppConfig):
+    """
+    Application configuration for the ``mayan.apps.importer`` app.
+    """
+    app_namespace = 'importer'
+    app_url = 'importer'
+    # The app ships tests but exposes no REST API.
+    has_rest_api = False
+    has_tests = True
+    name = 'mayan.apps.importer'
+    verbose_name = _('Importer')
+
+    def ready(self):
+        # No importer specific start up logic; defer to the parent class.
+        super(ImporterApp, self).ready()
--- a/mayan/apps/importer/management/__init__.py
+++ b/mayan/apps/importer/management/__init__.py
--- a/mayan/apps/importer/management/commands/__init__.py
+++ b/mayan/apps/importer/management/commands/__init__.py
--- a/mayan/apps/importer/management/commands/import.py
+++ b/mayan/apps/importer/management/commands/import.py
@@ -0,0 +1,150 @@
+from __future__ import unicode_literals
+
+import csv
+import time
+
+from django.apps import apps
+from django.core import management
+from django.core.files import File
+
+from ...tasks import task_upload_new_document
+
+
+class Command(management.BaseCommand):
+    """
+    Queue the upload of the documents listed in a CSV file.
+
+    Each row provides a document type label and the path to a file. The
+    file is copied into a SharedUploadedFile and a task is queued to create
+    the document. Document types are fetched or created by label and cached
+    for the duration of the command.
+    """
+    help = 'Import documents from a CSV file.'
+
+    def add_arguments(self, parser):
+        """Declare the column mapping and error handling options."""
+        parser.add_argument(
+            '--document_type_column',
+            action='store', dest='document_type_column', default=0,
+            help='Column that contains the document type labels. Column '
+            'numbers start at 0.',
+            type=int
+        )
+        parser.add_argument(
+            '--document_path_column',
+            action='store', dest='document_path_column', default=1,
+            help='Column that contains the path to the document files. Column '
+            'numbers start at 0.',
+            type=int
+        )
+        parser.add_argument(
+            '--ignore_errors',
+            action='store_true', dest='ignore_errors', default=False,
+            help='Don\'t stop the import process on common errors like '
+            'incorrect file paths.',
+        )
+        parser.add_argument(
+            '--ignore_rows',
+            action='store', dest='ignore_rows', default='',
+            help='Ignore a set of rows. Row numbers must be separated by commas.'
+        )
+        parser.add_argument(
+            '--metadata_pairs_column',
+            action='store', dest='metadata_pairs_column',
+            help='Column that contains metadata name and values for the '
+            'documents. Use the form: <label column>:<value column>. Example: '
+            '2:5. Separate multiple pairs with commas. Example: 2:5,7:10',
+        )
+        parser.add_argument('filelist', nargs='?', help='File list')
+
+    def handle(self, *args, **options):
+        """
+        Read the CSV file and queue one upload task per row.
+
+        Raises CommandError when no CSV file path is given. IOError and
+        OSError raised while processing a row are re-raised unless the
+        --ignore_errors option is set, in which case they are reported to
+        stderr and the row is skipped.
+        """
+        if not options['filelist']:
+            # CommandError is reported to stderr and produces a non zero exit
+            # status when executed from the command line.
+            raise management.CommandError('Must specify a CSV file path.')
+
+        time_start = time.time()
+        time_last_display = time_start
+        document_types = {}
+        uploaded_count = 0
+
+        # Row numbers are zero based, the same as the column numbers.
+        rows_to_ignore = set(
+            int(entry) for entry in options['ignore_rows'].split(',') if entry
+        )
+
+        # Parse the column pairs once instead of once per row. This also
+        # reports a malformed option before any file is copied.
+        metadata_column_pairs = []
+        if options['metadata_pairs_column']:
+            for pair in options['metadata_pairs_column'].split(','):
+                label_column, value_column = pair.split(':')
+                metadata_column_pairs.append(
+                    (int(label_column), int(value_column))
+                )
+
+        DocumentType = apps.get_model(
+            app_label='documents', model_name='DocumentType'
+        )
+        SharedUploadedFile = apps.get_model(
+            app_label='common', model_name='SharedUploadedFile'
+        )
+
+        with open(options['filelist']) as csv_datafile:
+            # enumerate keeps the row number correct even when the
+            # processing of a row fails.
+            for row_index, row in enumerate(csv.reader(csv_datafile)):
+                if row_index in rows_to_ignore:
+                    continue
+
+                try:
+                    # Documents are arbitrary files, read them as binary.
+                    with open(row[options['document_path_column']], mode='rb') as file_object:
+                        document_type_label = row[options['document_type_column']]
+
+                        if document_type_label not in document_types:
+                            self.stdout.write(
+                                'New document type: {}. Creating and caching.'.format(
+                                    document_type_label
+                                )
+                            )
+                            document_type, created = DocumentType.objects.get_or_create(
+                                label=document_type_label
+                            )
+                            document_types[document_type_label] = document_type
+                        else:
+                            document_type = document_types[document_type_label]
+
+                        # Build the metadata before copying the file so
+                        # that a short row doesn't leave an orphan upload.
+                        extra_data = {}
+                        if options['metadata_pairs_column']:
+                            extra_data['metadata_pairs'] = [
+                                {
+                                    'name': row[label_column],
+                                    'value': row[value_column]
+                                }
+                                for label_column, value_column in metadata_column_pairs
+                            ]
+
+                        shared_uploaded_file = SharedUploadedFile.objects.create(
+                            file=File(file_object)
+                        )
+
+                        task_upload_new_document.apply_async(
+                            kwargs=dict(
+                                document_type_id=document_type.pk,
+                                shared_uploaded_file_id=shared_uploaded_file.pk,
+                                extra_data=extra_data
+                            )
+                        )
+
+                        uploaded_count = uploaded_count + 1
+
+                        # Display the progress at most once per second.
+                        if (time.time() - time_last_display) > 1:
+                            time_last_display = time.time()
+                            self.stdout.write(
+                                'Time: {}s, Files copied and queued: {}, files processed per second: {}'.format(
+                                    int(time.time() - time_start),
+                                    uploaded_count,
+                                    uploaded_count / (time.time() - time_start)
+                                )
+                            )
+
+                except (IOError, OSError) as exception:
+                    if not options['ignore_errors']:
+                        raise
+
+                    self.stderr.write(
+                        'Error processing row: {}; {}.'.format(
+                            row_index, exception
+                        )
+                    )
+
+        self.stdout.write(
+            'Total files copied and queued: {}'.format(uploaded_count)
+        )
+        self.stdout.write(
+            'Total time: {}'.format(time.time() - time_start)
+        )
--- a/mayan/apps/importer/queues.py
+++ b/mayan/apps/importer/queues.py
@@ -0,0 +1,10 @@
+from __future__ import absolute_import, unicode_literals
+
+from django.utils.translation import ugettext_lazy as _
+
+from mayan.apps.documents.queues import queue_uploads
+
+# Add the importer upload task to the uploads queue of the documents app.
+queue_uploads.add_task_type(
+    dotted_path='mayan.apps.importer.tasks.task_upload_new_document',
+    label=_('Import new document')
+)
--- a/mayan/apps/importer/tasks.py
+++ b/mayan/apps/importer/tasks.py
@@ -0,0 +1,92 @@
+from __future__ import unicode_literals
+
+import logging
+
+from django.apps import apps
+from django.db import OperationalError
+from django.utils.text import slugify
+
+from mayan.celery import app
+
+from mayan.apps.documents.literals import UPLOAD_NEW_DOCUMENT_RETRY_DELAY
+
+logger = logging.getLogger(__name__)
+
+
+def _delete_shared_file(shared_file):
+    """Delete a shared uploaded file, only logging database errors."""
+    try:
+        shared_file.delete()
+    except OperationalError as exception:
+        logger.warning(
+            'Operational error during attempt to delete shared '
+            'file: %s; %s.', shared_file, exception
+        )
+
+
+@app.task(bind=True, default_retry_delay=UPLOAD_NEW_DOCUMENT_RETRY_DELAY, ignore_result=True)
+def task_upload_new_document(self, document_type_id, shared_uploaded_file_id, extra_data=None):
+    """
+    Create a new document from a shared uploaded file and add its metadata.
+
+    :param document_type_id: primary key of the DocumentType to use.
+    :param shared_uploaded_file_id: primary key of the SharedUploadedFile
+        holding the content of the new document.
+    :param extra_data: optional dictionary. The ``metadata_pairs`` key, when
+        present, is a list of dictionaries with a ``name`` and a ``value``
+        key. Each one is added as a metadata entry of the new document.
+
+    Database OperationalError exceptions during the retrieval of the data or
+    the creation of the document cause the task to be retried and leave the
+    shared uploaded file in place. Any other error during the creation of
+    the document is logged. The shared uploaded file is deleted once the
+    document creation was attempted, even if adding the metadata fails.
+    """
+    DocumentType = apps.get_model(
+        app_label='documents', model_name='DocumentType'
+    )
+
+    MetadataType = apps.get_model(
+        app_label='metadata', model_name='MetadataType'
+    )
+
+    SharedUploadedFile = apps.get_model(
+        app_label='common', model_name='SharedUploadedFile'
+    )
+
+    try:
+        document_type = DocumentType.objects.get(pk=document_type_id)
+        shared_file = SharedUploadedFile.objects.get(
+            pk=shared_uploaded_file_id
+        )
+    except OperationalError as exception:
+        # Log the ID and not the instance: the instance is not bound when
+        # the document type query is the one that failed.
+        logger.warning(
+            'Operational error during attempt to retrieve shared data for '
+            'new document of type: %s; %s. Retrying.', document_type_id,
+            exception
+        )
+        raise self.retry(exc=exception)
+
+    try:
+        with shared_file.open() as file_object:
+            new_document = document_type.new_document(file_object=file_object)
+    except OperationalError as exception:
+        # The shared file is kept, the retry needs it.
+        logger.warning(
+            'Operational error during attempt to create new document '
+            'of type: %s; %s. Retrying.', document_type, exception
+        )
+        raise self.retry(exc=exception)
+    except Exception as exception:
+        logger.error(
+            'Unexpected error during attempt to create new document '
+            'of type: %s; %s', document_type, exception
+        )
+        _delete_shared_file(shared_file=shared_file)
+    else:
+        try:
+            if extra_data:
+                for pair in extra_data.get('metadata_pairs', []):
+                    # Derive an internal name from the label.
+                    name = slugify(pair['name']).replace('-', '_')
+                    logger.debug(
+                        'Metadata pair (label, name, value): %s, %s, %s',
+                        pair['name'], name, pair['value']
+                    )
+
+                    metadata_type, _created = MetadataType.objects.get_or_create(
+                        label=pair['name'], defaults={'name': name}
+                    )
+                    # Associate the metadata type with the document type
+                    # even when the metadata type already existed.
+                    if not new_document.document_type.metadata.filter(metadata_type=metadata_type).exists():
+                        logger.debug(
+                            'Associating metadata type with document type'
+                        )
+                        new_document.document_type.metadata.create(
+                            metadata_type=metadata_type, required=False
+                        )
+
+                    new_document.metadata.create(
+                        metadata_type=metadata_type, value=pair['value']
+                    )
+        finally:
+            # The document already exists, the shared file is not needed
+            # even if adding the metadata failed.
+            _delete_shared_file(shared_file=shared_file)
--- a/mayan/apps/importer/tests/__init__.py
+++ b/mayan/apps/importer/tests/__init__.py
--- a/mayan/apps/importer/tests/test_management_commands.py
+++ b/mayan/apps/importer/tests/test_management_commands.py
@@ -0,0 +1,94 @@
+from __future__ import unicode_literals
+
+import csv
+
+from django.core import management
+from django.utils.encoding import force_bytes
+
+from mayan.apps.documents.models import DocumentType, Document
+from mayan.apps.documents.tests import GenericDocumentTestCase
+from mayan.apps.documents.tests.literals import TEST_SMALL_DOCUMENT_PATH
+from mayan.apps.storage.utils import fs_cleanup, mkstemp
+
+
+class ImportManagementCommandTestCase(GenericDocumentTestCase):
+    """
+    Tests for the ``import`` management command using a generated CSV file.
+    """
+    auto_upload_document = False
+    random_primary_key_enable = False
+    # Number of rows written to the test CSV file.
+    test_import_count = 1
+
+    def setUp(self):
+        super(ImportManagementCommandTestCase, self).setUp()
+        self._create_test_csv_file()
+
+    def tearDown(self):
+        self._destroy_test_csv_file()
+        super(ImportManagementCommandTestCase, self).tearDown()
+
+    def _create_test_csv_file(self):
+        """
+        Write a CSV file with the document type label in column 0, the path
+        of the test document in column 1 and filler text in columns 2 to 11.
+        """
+        self.test_csv_file_descriptor, self.test_csv_path = mkstemp()
+
+        print('Test CSV file: {}'.format(self.test_csv_path))
+
+        with open(self.test_csv_path, mode='wb') as csvfile:
+            filewriter = csv.writer(
+                csvfile, delimiter=force_bytes(','), quotechar=force_bytes('"'),
+                quoting=csv.QUOTE_MINIMAL
+            )
+            print(
+                'Generating test CSV for {} documents'.format(
+                    self.test_import_count
+                )
+            )
+            for times in range(self.test_import_count):
+                filewriter.writerow(
+                    [
+                        self.test_document_type.label, TEST_SMALL_DOCUMENT_PATH,
+                        'column 2', 'column 3', 'column 4', 'column 5',
+                        'column 6', 'column 7', 'column 8', 'column 9',
+                        'column 10', 'column 11',
+                    ]
+                )
+
+    def _destroy_test_csv_file(self):
+        fs_cleanup(
+            filename=self.test_csv_path,
+            file_descriptor=self.test_csv_file_descriptor
+        )
+
+    def test_import_csv_read(self):
+        """The default column mapping creates the type and the document."""
+        self.test_document_type.delete()
+        management.call_command('import', self.test_csv_path)
+
+        self.assertGreater(DocumentType.objects.count(), 0)
+        self.assertGreater(Document.objects.count(), 0)
+
+    def test_import_document_type_column_mapping(self):
+        """The document type label is read from the selected column."""
+        self.test_document_type.delete()
+        management.call_command(
+            'import', self.test_csv_path, '--document_type_column', '2'
+        )
+
+        self.assertEqual(DocumentType.objects.first().label, 'column 2')
+        self.assertGreater(Document.objects.count(), 0)
+
+    def test_import_document_path_column_mapping(self):
+        """A path column that holds no valid file path raises IOError."""
+        self.test_document_type.delete()
+        with self.assertRaises(IOError):
+            management.call_command(
+                'import', self.test_csv_path, '--document_path_column', '2'
+            )
+
+    def test_import_metadata_column_mapping(self):
+        """Metadata label and value columns become document metadata."""
+        self.test_document_type.delete()
+        management.call_command(
+            'import', self.test_csv_path, '--metadata_pairs_column', '2:3,4:5',
+        )
+
+        self.assertGreater(DocumentType.objects.count(), 0)
+        self.assertGreater(Document.objects.count(), 0)
+        self.assertGreater(Document.objects.first().metadata.count(), 0)
+        self.assertEqual(
+            Document.objects.first().metadata.get(
+                metadata_type__name='column_2'
+            ).value, 'column 3'
+        )
--- a/mayan/settings/base.py
+++ b/mayan/settings/base.py
@@ -120,6 +120,7 @@ INSTALLED_APPS = (
    'mayan.apps.document_states',
    'mayan.apps.documents',
    'mayan.apps.file_metadata',
+    'mayan.apps.importer',
    'mayan.apps.linking',
    'mayan.apps.mailer',
    'mayan.apps.mayan_statistics',
Author	SHA1	Message	Date
Roberto Rosario	39eabe1c54	Associate metadata to all types Previously metadata types were associated to documents types if the metadata type was newly created. Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>	2019-06-21 17:37:00 -04:00
Roberto Rosario	6fc9e46882	Merge branch 'versions/minor' into feature/document_importer	2019-06-21 11:53:09 -04:00
Roberto Rosario	2d326a679d	Merge branch 'master' into feature/document_importer	2019-06-21 11:53:03 -04:00
Roberto Rosario	aa8c2db446	Merge branch 'master' into feature/document_importer Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>	2019-06-21 00:06:49 -04:00
Roberto Rosario	925b55d76d	Support ignoring certain rows Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>	2019-06-20 10:12:53 -04:00
Roberto Rosario	5808d3653d	Add support for ignoring import errors Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>	2019-06-20 10:05:24 -04:00
Roberto Rosario	bc072f7b7e	Add column mapping support Add support for specifying metadata columns. Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>	2019-06-19 17:47:32 -04:00
Roberto Rosario	b3d59eee39	Add MVP of the importer app Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>	2019-06-19 16:02:00 -04:00
Roberto Rosario	7d379a52af	Add a reusable task to upload documents Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>	2019-06-19 16:00:59 -04:00
Roberto Rosario	499ab1f3e7	Allow disabling the random primary key test mixin Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>	2019-06-19 15:59:15 -04:00