Support ignoring certain rows
Signed-off-by: Roberto Rosario <roberto.rosario.gonzalez@gmail.com>
@@ -34,6 +34,11 @@ class Command(management.BaseCommand):
             help='Don\'t stop the import process on common errors like '
             'incorrect file paths.',
         )
+        parser.add_argument(
+            '--ignore_rows',
+            action='store', dest='ignore_rows', default='',
+            help='Ignore a set of rows. Row numbers must be separated by commas.'
+        )
         parser.add_argument(
             '--metadata_pairs_column',
             action='store', dest='metadata_pairs_column',
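
The new --ignore_rows option is received as a plain comma-separated string. A minimal sketch of exercising it from code via Django's call_command (the command name 'importfilelist' and the CSV file name are assumed placeholders; neither appears in this diff):

    # Hypothetical invocation; 'importfilelist' and 'documents.csv' are
    # assumed names, not taken from this commit.
    from django.core.management import call_command

    call_command(
        'importfilelist', 'documents.csv',
        ignore_rows='0,5,12', ignore_errors=True
    )
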
@@ -49,6 +54,10 @@ class Command(management.BaseCommand):
         document_types = {}
         uploaded_count = 0
         row_count = 0
+        rows_to_ignore = []
+        for entry in options['ignore_rows'].split(','):
+            if entry:
+                rows_to_ignore.append(int(entry))
 
         DocumentType = apps.get_model(
             app_label='documents', model_name='DocumentType'
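
Because of the `if entry` guard, the default empty string parses to an empty list, so no rows are ignored unless the option is supplied. A standalone illustration of that parsing behavior (not part of the commit):

    # ''.split(',') yields [''], so the falsy check skips the empty entry.
    def parse_ignore_rows(value):
        return [int(entry) for entry in value.split(',') if entry]

    assert parse_ignore_rows('') == []
    assert parse_ignore_rows('0,5,12') == [0, 5, 12]

A value like '1,,2' is also tolerated (the empty field is dropped), while a non-numeric entry raises ValueError.
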
@@ -64,71 +73,75 @@ class Command(management.BaseCommand):
         with open(options['filelist']) as csv_datafile:
             csv_reader = csv.reader(csv_datafile)
             for row in csv_reader:
+                # Increase the row count here even though the start index
+                # is 0; the purpose is to avoid losing row number
+                # increments on exceptions.
                 row_count = row_count + 1
-                try:
-                    with open(row[options['document_path_column']]) as file_object:
-                        document_type_label = row[options['document_type_column']]
-
-                        if document_type_label not in document_types:
-                            self.stdout.write(
-                                'New document type: {}. Creating and caching.'.format(
-                                    document_type_label
-                                )
-                            )
-                            document_type, created = DocumentType.objects.get_or_create(
-                                label=document_type_label
-                            )
-                            document_types[document_type_label] = document_type
-                        else:
-                            document_type = document_types[document_type_label]
-
-                        shared_uploaded_file = SharedUploadedFile.objects.create(
-                            file=File(file_object)
-                        )
-
-                        extra_data = {}
-                        if options['metadata_pairs_column']:
-                            extra_data['metadata_pairs'] = []
-
-                            for pair in options['metadata_pairs_column'].split(','):
-                                name, value = pair.split(':')
-                                extra_data['metadata_pairs'].append(
-                                    {
-                                        'name': row[int(name)],
-                                        'value': row[int(value)]
-                                    }
-                                )
-
-                        task_upload_new_document.apply_async(
-                            kwargs=dict(
-                                document_type_id=document_type.pk,
-                                shared_uploaded_file_id=shared_uploaded_file.pk,
-                                extra_data=extra_data
-                            )
-                        )
-
-                        uploaded_count = uploaded_count + 1
-
-                        if (time.time() - time_last_display) > 1:
-                            time_last_display = time.time()
-                            self.stdout.write(
-                                'Time: {}s, Files copied and queued: {}, files processed per second: {}'.format(
-                                    int(time.time() - time_start),
-                                    uploaded_count,
-                                    uploaded_count / (time.time() - time_start)
-                                )
-                            )
-
-                except (IOError, OSError) as exception:
-                    if not options['ignore_errors']:
-                        raise
-                    else:
-                        self.stderr.write(
-                            'Error processing row: {}; {}.'.format(
-                                row_count - 1, exception
-                            )
-                        )
+                if row_count - 1 not in rows_to_ignore:
+                    try:
+                        with open(row[options['document_path_column']]) as file_object:
+                            document_type_label = row[options['document_type_column']]
+
+                            if document_type_label not in document_types:
+                                self.stdout.write(
+                                    'New document type: {}. Creating and caching.'.format(
+                                        document_type_label
+                                    )
+                                )
+                                document_type, created = DocumentType.objects.get_or_create(
+                                    label=document_type_label
+                                )
+                                document_types[document_type_label] = document_type
+                            else:
+                                document_type = document_types[document_type_label]
+
+                            shared_uploaded_file = SharedUploadedFile.objects.create(
+                                file=File(file_object)
+                            )
+
+                            extra_data = {}
+                            if options['metadata_pairs_column']:
+                                extra_data['metadata_pairs'] = []
+
+                                for pair in options['metadata_pairs_column'].split(','):
+                                    name, value = pair.split(':')
+                                    extra_data['metadata_pairs'].append(
+                                        {
+                                            'name': row[int(name)],
+                                            'value': row[int(value)]
+                                        }
+                                    )
+
+                            task_upload_new_document.apply_async(
+                                kwargs=dict(
+                                    document_type_id=document_type.pk,
+                                    shared_uploaded_file_id=shared_uploaded_file.pk,
+                                    extra_data=extra_data
+                                )
+                            )
+
+                            uploaded_count = uploaded_count + 1
+
+                            if (time.time() - time_last_display) > 1:
+                                time_last_display = time.time()
+                                self.stdout.write(
+                                    'Time: {}s, Files copied and queued: {}, files processed per second: {}'.format(
+                                        int(time.time() - time_start),
+                                        uploaded_count,
+                                        uploaded_count / (time.time() - time_start)
+                                    )
+                                )
+
+                    except (IOError, OSError) as exception:
+                        if not options['ignore_errors']:
+                            raise
+                        else:
+                            self.stderr.write(
+                                'Error processing row: {}; {}.'.format(
+                                    row_count - 1, exception
+                                )
+                            )
 
         self.stdout.write(
             'Total files copied and queued: {}'.format(uploaded_count)
         )
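
The row numbers accepted by --ignore_rows are zero-based: row_count is incremented before the check, and the comparison uses row_count - 1. A small standalone demonstration of that numbering (not part of the commit):

    # With rows_to_ignore = [0], the first CSV row is skipped.
    rows_to_ignore = [0]
    row_count = 0
    for row in [['a.pdf'], ['b.pdf'], ['c.pdf']]:
        row_count = row_count + 1
        if row_count - 1 not in rows_to_ignore:
            print('processing', row[0])  # prints b.pdf, then c.pdf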