Compare commits

...

10 Commits

Author SHA1 Message Date
Roberto Rosario
39eabe1c54 Associate metadata to all types
Previously metadata types were associated to documents types
if the metadata type was newly created.

Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
2019-06-21 17:37:00 -04:00
Roberto Rosario
6fc9e46882 Merge branch 'versions/minor' into feature/document_importer 2019-06-21 11:53:09 -04:00
Roberto Rosario
2d326a679d Merge branch 'master' into feature/document_importer 2019-06-21 11:53:03 -04:00
Roberto Rosario
aa8c2db446 Merge branch 'master' into feature/document_importer
Signed-off-by: Roberto Rosario <roberto.rosario@mayan-edms.com>
2019-06-21 00:06:49 -04:00
Roberto Rosario
925b55d76d Support ignoring certain rows
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-06-20 10:12:53 -04:00
Roberto Rosario
5808d3653d Add support for ignoring import errors
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-06-20 10:05:24 -04:00
Roberto Rosario
bc072f7b7e Add column mapping support
Add support for specifying metadata columns.

Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-06-19 17:47:32 -04:00
Roberto Rosario
b3d59eee39 Add MVP of the importer app
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-06-19 16:02:00 -04:00
Roberto Rosario
7d379a52af Add a reusable task to upload documents
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-06-19 16:00:59 -04:00
Roberto Rosario
499ab1f3e7 Allow disabling the random primary key test mixin
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
2019-06-19 15:59:15 -04:00
15 changed files with 430 additions and 2 deletions

View File

@@ -7,6 +7,8 @@
==================
* Add support for disabling the random primary key
test mixin.
* Add a reusable task to upload documents.
* Add MVP of the importer app.
* Fix mailing profile log columns mappings.
GitLab issue #626. Thanks to Jesaja Everling (@jeverling)
for the report.

View File

@@ -19,7 +19,6 @@ Changes
GitLab issue #625. Thanks to Jesaja Everling (@jeverling)
for the report and the research.
Removals
--------

View File

@@ -32,6 +32,7 @@ DEFAULT_DOCUMENT_TYPE_LABEL = _('Default')
DOCUMENT_IMAGE_TASK_TIMEOUT = 120
STUB_EXPIRATION_INTERVAL = 60 * 60 * 24 # 24 hours
UPDATE_PAGE_COUNT_RETRY_DELAY = 10
UPLOAD_NEW_DOCUMENT_RETRY_DELAY = 10
UPLOAD_NEW_VERSION_RETRY_DELAY = 10
PAGE_RANGE_ALL = 'all'

View File

@@ -82,3 +82,7 @@ queue_uploads.add_task_type(
dotted_path='mayan.apps.documents.tasks.task_scan_duplicates_for',
label=_('Scan document duplicates')
)
queue_uploads.add_task_type(
dotted_path='mayan.apps.documents.tasks.task_upload_new_document',
label=_('Upload new document')
)

View File

@@ -9,7 +9,8 @@ from django.db import OperationalError
from mayan.celery import app
from .literals import (
UPDATE_PAGE_COUNT_RETRY_DELAY, UPLOAD_NEW_VERSION_RETRY_DELAY
UPDATE_PAGE_COUNT_RETRY_DELAY, UPLOAD_NEW_DOCUMENT_RETRY_DELAY,
UPLOAD_NEW_VERSION_RETRY_DELAY
)
logger = logging.getLogger(__name__)
@@ -127,6 +128,60 @@ def task_update_page_count(self, version_id):
raise self.retry(exc=exception)
@app.task(bind=True, default_retry_delay=UPLOAD_NEW_DOCUMENT_RETRY_DELAY, ignore_result=True)
def task_upload_new_document(self, document_type_id, shared_uploaded_file_id):
DocumentType = apps.get_model(
app_label='documents', model_name='DocumentType'
)
SharedUploadedFile = apps.get_model(
app_label='common', model_name='SharedUploadedFile'
)
try:
document_type = DocumentType.objects.get(pk=document_type_id)
shared_file = SharedUploadedFile.objects.get(
pk=shared_uploaded_file_id
)
except OperationalError as exception:
logger.warning(
'Operational error during attempt to retrieve shared data for '
'new document of type: %s; %s. Retrying.', document_type, exception
)
raise self.retry(exc=exception)
try:
with shared_file.open() as file_object:
document_type.new_document(file_object=file_object)
except OperationalError as exception:
logger.warning(
'Operational error during attempt to create new document '
'of type: %s; %s. Retrying.', document_type, exception
)
raise self.retry(exc=exception)
except Exception as exception:
# This except and else block emulate a finally:
logger.error(
'Unexpected error during attempt to create new document '
'of type: %s; %s', document_type, exception
)
try:
shared_file.delete()
except OperationalError as exception:
logger.warning(
'Operational error during attempt to delete shared '
'file: %s; %s.', shared_file, exception
)
else:
try:
shared_file.delete()
except OperationalError as exception:
logger.warning(
'Operational error during attempt to delete shared '
'file: %s; %s.', shared_file, exception
)
@app.task(bind=True, default_retry_delay=UPLOAD_NEW_VERSION_RETRY_DELAY, ignore_result=True)
def task_upload_new_version(self, document_id, shared_uploaded_file_id, user_id, comment=None):
SharedUploadedFile = apps.get_model(

View File

@@ -0,0 +1,3 @@
from __future__ import unicode_literals
default_app_config = 'mayan.apps.importer.apps.ImporterApp'

View File

@@ -0,0 +1,17 @@
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from mayan.apps.common.apps import MayanAppConfig
class ImporterApp(MayanAppConfig):
app_namespace = 'importer'
app_url = 'importer'
has_rest_api = False
has_tests = True
name = 'mayan.apps.importer'
verbose_name = _('Importer')
def ready(self):
super(ImporterApp, self).ready()

View File

@@ -0,0 +1,150 @@
from __future__ import unicode_literals
import csv
import time
from django.apps import apps
from django.core import management
from django.core.files import File
from ...tasks import task_upload_new_document
class Command(management.BaseCommand):
help = 'Import documents from a CSV file.'
def add_arguments(self, parser):
parser.add_argument(
'--document_type_column',
action='store', dest='document_type_column', default=0,
help='Column that contains the document type labels. Column '
'numbers start at 0.',
type=int
)
parser.add_argument(
'--document_path_column',
action='store', dest='document_path_column', default=1,
help='Column that contains the path to the document files. Column '
'numbers start at 0.',
type=int
)
parser.add_argument(
'--ignore_errors',
action='store_true', dest='ignore_errors', default=False,
help='Don\'t stop the import process on common errors like '
'incorrect file paths.',
)
parser.add_argument(
'--ignore_rows',
action='store', dest='ignore_rows', default='',
help='Ignore a set of rows. Row numbers must be separated by commas.'
)
parser.add_argument(
'--metadata_pairs_column',
action='store', dest='metadata_pairs_column',
help='Column that contains metadata name and values for the '
'documents. Use the form: <label column>:<value column>. Example: '
'2:5. Separate multiple pairs with commas. Example: 2:5,7:10',
)
parser.add_argument('filelist', nargs='?', help='File list')
def handle(self, *args, **options):
time_start = time.time()
time_last_display = time_start
document_types = {}
uploaded_count = 0
row_count = 0
rows_to_ignore = []
for entry in options['ignore_rows'].split(','):
if entry:
rows_to_ignore.append(int(entry))
DocumentType = apps.get_model(
app_label='documents', model_name='DocumentType'
)
SharedUploadedFile = apps.get_model(
app_label='common', model_name='SharedUploadedFile'
)
if not options['filelist']:
self.stderr.write('Must specify a CSV file path.')
exit(1)
else:
with open(options['filelist']) as csv_datafile:
csv_reader = csv.reader(csv_datafile)
for row in csv_reader:
# Increase row count here even though start index is 0
# purpose is to avoid losing row number increments on
# exceptions
row_count = row_count + 1
if row_count - 1 not in rows_to_ignore:
try:
with open(row[options['document_path_column']]) as file_object:
document_type_label = row[options['document_type_column']]
if document_type_label not in document_types:
self.stdout.write(
'New document type: {}. Creating and caching.'.format(
document_type_label
)
)
document_type, created = DocumentType.objects.get_or_create(
label=document_type_label
)
document_types[document_type_label] = document_type
else:
document_type = document_types[document_type_label]
shared_uploaded_file = SharedUploadedFile.objects.create(
file=File(file_object)
)
extra_data = {}
if options['metadata_pairs_column']:
extra_data['metadata_pairs'] = []
for pair in options['metadata_pairs_column'].split(','):
name, value = pair.split(':')
extra_data['metadata_pairs'].append(
{
'name': row[int(name)],
'value': row[int(value)]
}
)
task_upload_new_document.apply_async(
kwargs=dict(
document_type_id=document_type.pk,
shared_uploaded_file_id=shared_uploaded_file.pk,
extra_data=extra_data
)
)
uploaded_count = uploaded_count + 1
if (time.time() - time_last_display) > 1:
time_last_display = time.time()
self.stdout.write(
'Time: {}s, Files copied and queued: {}, files processed per second: {}'.format(
int(time.time() - time_start),
uploaded_count,
uploaded_count / (time.time() - time_start)
)
)
except (IOError, OSError) as exception:
if not options['ignore_errors']:
raise
else:
self.stderr.write(
'Error processing row: {}; {}.'.format(
row_count - 1, exception
)
)
self.stdout.write(
'Total files copied and queues: {}'.format(uploaded_count)
)
self.stdout.write(
'Total time: {}'.format(time.time() - time_start)
)

View File

@@ -0,0 +1,10 @@
from __future__ import absolute_import, unicode_literals
from django.utils.translation import ugettext_lazy as _
from mayan.apps.documents.queues import queue_uploads
queue_uploads.add_task_type(
dotted_path='mayan.apps.importer.tasks.task_upload_new_document',
label=_('Import new document')
)

View File

@@ -0,0 +1,92 @@
from __future__ import unicode_literals
import logging
from django.apps import apps
from django.db import OperationalError
from django.utils.text import slugify
from mayan.celery import app
from mayan.apps.documents.literals import UPLOAD_NEW_DOCUMENT_RETRY_DELAY
logger = logging.getLogger(__name__)
@app.task(bind=True, default_retry_delay=UPLOAD_NEW_DOCUMENT_RETRY_DELAY, ignore_result=True)
def task_upload_new_document(self, document_type_id, shared_uploaded_file_id, extra_data=None):
DocumentType = apps.get_model(
app_label='documents', model_name='DocumentType'
)
MetadataType = apps.get_model(
app_label='metadata', model_name='MetadataType'
)
SharedUploadedFile = apps.get_model(
app_label='common', model_name='SharedUploadedFile'
)
try:
document_type = DocumentType.objects.get(pk=document_type_id)
shared_file = SharedUploadedFile.objects.get(
pk=shared_uploaded_file_id
)
except OperationalError as exception:
logger.warning(
'Operational error during attempt to retrieve shared data for '
'new document of type: %s; %s. Retrying.', document_type, exception
)
raise self.retry(exc=exception)
try:
with shared_file.open() as file_object:
new_document = document_type.new_document(file_object=file_object)
except OperationalError as exception:
logger.warning(
'Operational error during attempt to create new document '
'of type: %s; %s. Retrying.', document_type, exception
)
raise self.retry(exc=exception)
except Exception as exception:
# This except and else block emulate a finally:
logger.error(
'Unexpected error during attempt to create new document '
'of type: %s; %s', document_type, exception
)
try:
shared_file.delete()
except OperationalError as exception:
logger.warning(
'Operational error during attempt to delete shared '
'file: %s; %s.', shared_file, exception
)
else:
if extra_data:
for pair in extra_data.get('metadata_pairs', []):
name = slugify(pair['name']).replace('-', '_')
logger.debug(
'Metadata pair (label, name, value): %s, %s, %s',
pair['name'], name, pair['value']
)
metadata_type, created = MetadataType.objects.get_or_create(
label=pair['name'], defaults={'name': name}
)
if not new_document.document_type.metadata.filter(metadata_type=metadata_type).exists():
logger.debug('Metadata type created')
new_document.document_type.metadata.create(
metadata_type=metadata_type, required=False
)
new_document.metadata.create(
metadata_type=metadata_type, value=pair['value']
)
try:
shared_file.delete()
except OperationalError as exception:
logger.warning(
'Operational error during attempt to delete shared '
'file: %s; %s.', shared_file, exception
)

View File

View File

@@ -0,0 +1,94 @@
from __future__ import unicode_literals
import csv
from django.core import management
from django.utils.encoding import force_bytes
from mayan.apps.documents.models import DocumentType, Document
from mayan.apps.documents.tests import GenericDocumentTestCase
from mayan.apps.documents.tests.literals import TEST_SMALL_DOCUMENT_PATH
from mayan.apps.storage.utils import fs_cleanup, mkstemp
class ImportManagementCommandTestCase(GenericDocumentTestCase):
auto_upload_document = False
random_primary_key_enable = False
test_import_count = 1
def setUp(self):
super(ImportManagementCommandTestCase, self).setUp()
self._create_test_csv_file()
def tearDown(self):
self._destroy_test_csv_file()
super(ImportManagementCommandTestCase, self).tearDown()
def _create_test_csv_file(self):
self.test_csv_file_descriptor, self.test_csv_path = mkstemp()
print('Test CSV file: {}'.format(self.test_csv_path))
with open(self.test_csv_path, mode='wb') as csvfile:
filewriter = csv.writer(
csvfile, delimiter=force_bytes(','), quotechar=force_bytes('"'),
quoting=csv.QUOTE_MINIMAL
)
print(
'Generating test CSV for {} documents'.format(
self.test_import_count
)
)
for times in range(self.test_import_count):
filewriter.writerow(
[
self.test_document_type.label, TEST_SMALL_DOCUMENT_PATH,
'column 2', 'column 3', 'column 4', 'column 5',
'column 6', 'column 7', 'column 8', 'column 9',
'column 10', 'column 11',
]
)
def _destroy_test_csv_file(self):
fs_cleanup(
filename=self.test_csv_path,
file_descriptor=self.test_csv_file_descriptor
)
def test_import_csv_read(self):
self.test_document_type.delete()
management.call_command('import', self.test_csv_path)
self.assertTrue(DocumentType.objects.count() > 0)
self.assertTrue(Document.objects.count() > 0)
def test_import_document_type_column_mapping(self):
self.test_document_type.delete()
management.call_command(
'import', self.test_csv_path, '--document_type_column', '2'
)
self.assertTrue(DocumentType.objects.first().label == 'column 2')
self.assertTrue(Document.objects.count() > 0)
def test_import_document_path_column_mapping(self):
self.test_document_type.delete()
with self.assertRaises(IOError):
management.call_command(
'import', self.test_csv_path, '--document_path_column', '2'
)
def test_import_metadata_column_mapping(self):
self.test_document_type.delete()
management.call_command(
'import', self.test_csv_path, '--metadata_pairs_column', '2:3,4:5',
)
self.assertTrue(DocumentType.objects.count() > 0)
self.assertTrue(Document.objects.count() > 0)
self.assertTrue(Document.objects.first().metadata.count() > 0)
self.assertEqual(
Document.objects.first().metadata.get(
metadata_type__name='column_2'
).value, 'column 3'
)

View File

@@ -120,6 +120,7 @@ INSTALLED_APPS = (
'mayan.apps.document_states',
'mayan.apps.documents',
'mayan.apps.file_metadata',
'mayan.apps.importer',
'mayan.apps.linking',
'mayan.apps.mailer',
'mayan.apps.mayan_statistics',