-
Notifications
You must be signed in to change notification settings - Fork 1
/
records.py
348 lines (276 loc) · 13.2 KB
/
records.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
#! /usr/bin/env python3
"""
DCAN Labs NDA BIDS preparation tool
Created 02/20/2020 Eric Earl ([email protected])
"""
import argparse
import csv
import math
import os
import subprocess
import sys
import yaml
import pandas as pd
from glob import glob
# Absolute, symlink-resolved directory that holds this script. The bundled
# "templates" and "manifest-data" folders are looked up relative to it.
HERE = os.path.split(os.path.realpath(__file__))[0]

# Rebind the module docstring to the text used as the argparse description.
__doc__ = """
This python command-line tool allows the user to do
more automated NDA BIDS data upload preparation
given an input folder structure hierarchy that obeys
the DCAN Labs NDA BIDS preparation standard.
"""
def generate_parser():
    """Build the command-line argument parser for records.py.

    Returns:
        argparse.ArgumentParser: parser with three required options:
        ``-p/--parent`` (stored as ``parent``), ``-l/--lookup`` (stored as
        ``lookup_csv``) and ``-y/--yaml-dir`` (stored as ``yaml_dir``).
    """
    parser = argparse.ArgumentParser(
        prog='records.py',
        description=__doc__
    )
    parser.add_argument(
        '-p', '--parent', dest='parent', metavar='PARENT_DIR', type=str, required=True,
        help=('Path to the "parent" folder to be prepared for upload. "Parent" '
              'folder should be of the format: ".../ndastructure_type.class.subset" '
              'containing subfolders called "sub-subject_ses-session.type.class.subset" '
              'where "ndastructure" is fmriresults01, image03, or imagingcollection01, '
              '"subject" is the BIDS subject ID/participant label, '
              '"session" is the BIDS session ID, "type" is "inputs", '
              '"derivatives", or "sourcedata", "class" is "anat", "dwi", "fmap", '
              '"func", or something similar, and "subset" is the user-defined '
              '"data subset type". '
              'For example: "image03_inputs.anat.T1w" containing subfolders like '
              '"sub-NDARABC123_ses-baseline.inputs.anat.T1w"')
    )
    parser.add_argument(
        '-l', '--lookup', dest='lookup_csv', metavar='LOOKUP_CSV', type=str, required=True,
        help=('Path to the lookup CSV file. It must contain "bids_subject_id" and '
              '"bids_session_id" columns; every other column of the matching row '
              'is copied into the NDA record of that subject/session.')
    )
    parser.add_argument(
        '-y', '--yaml-dir', dest='yaml_dir', metavar='YAML_DIR', type=str, required=True,
        help=('Path to the directory holding the content .yaml files. It must '
              'contain a file named after the parent folder, for example '
              '"image03_inputs.anat.T1w.yaml".')
    )
    return parser
# Sanity check against user inputs
def records_sanity_check(input, lookup_csv, yaml_dir):
    """Validate the user inputs and exit with a distinct code on the first failure.

    Args:
        input: path to the "parent" folder, named
            "<ndastructure>_<X>.<Y>.<Z>" (e.g. "image03_inputs.anat.T1w").
        lookup_csv: path to the lookup CSV file; only its existence is checked.
        yaml_dir: directory searched for "<parent basename>.yaml".

    Exits (via ``sys.exit``) with:
        1  input is not a directory
        2  the bundled manifest-data/nda_manifests.py script is missing
        3  lookup_csv is not a file
        4  the parent name does not start with a supported NDA structure
        5  the part after the first underscore does not contain exactly two periods
        6  section X is not "inputs", "derivatives" or "sourcedata"
        7  section Y contains an underscore
        8  a child folder does not start with "sub-NDAR" or its X.Y.Z suffix
           differs from the parent's
        9  zero or several content .yaml files match the parent basename
        10 the content .yaml file is not a mapping or has an empty field

    Returns None when every check passes.
    """
    # check if input is a directory
    if not os.path.isdir(input):
        print(input + ' is not a directory! Exiting...')
        sys.exit(1)

    parent = os.path.abspath(os.path.realpath(input))
    manifest_script = os.path.join(HERE, 'manifest-data', 'nda_manifests.py')

    # check if manifest_script exists
    if not os.path.isfile(manifest_script):
        print(manifest_script + ' is not a file, contained a directory above "parent" in a directory called manifest-data. Exiting...')
        sys.exit(2)

    # check if lookup_csv exists
    if not os.path.isfile(lookup_csv):
        print(lookup_csv + ' is not a file, contained a directory above "parent" called "lookup.csv". Exiting...')
        sys.exit(3)

    # grab parent's basename; partition() never raises, so a name without an
    # underscore falls through to the regular naming errors below
    basename = os.path.basename(parent)
    nda_struct, _, file_config = basename.partition('_')

    if nda_struct not in ('fmriresults01', 'image03', 'imagingcollection01'):
        print(basename + ' is not a valid entry for section A. Improper parent folder name. Exiting...')
        sys.exit(4)

    if file_config.count('.') != 2:
        print(file_config + ' is an improper parent folder naming convention. The parent folder MUST only contain two periods total. Exiting...')
        sys.exit(5)

    # section Z is not validated, only X and Y are
    input_deriv, subsets, _ = file_config.split('.')

    if input_deriv not in ('inputs', 'derivatives', 'sourcedata'):
        print(input_deriv + ' is not a valid entry for section X. Section X MUST be either "inputs", "derivatives", or "sourcedata". Improper parent folder name. Exiting...')
        sys.exit(6)

    if '_' in subsets:
        print(subsets + ' is not a valid entry for section Y. Section Y MUST have no underscores. Improper parent folder name. Exiting...')
        sys.exit(7)

    # Only the immediate children of the parent are checked, so take the
    # first os.walk() entry instead of traversing the whole upload tree.
    child_dirs = next(os.walk(parent), (parent, [], []))[1]

    problem_child_flag = False
    for directory in child_dirs:
        if not directory.startswith('sub-NDAR'):
            problem_child_flag = True
            print('Improper child folder name: ' + directory + '. Child directories MUST start with "sub-NDAR". Exiting after full check...')
            continue

        # a child without any period yields an empty suffix and is reported
        # as a mismatch rather than raising
        sub_directory_config = directory.partition('.')[2]
        if sub_directory_config != file_config:
            problem_child_flag = True
            print('Improper child folder name. Sections X.Y.Z MUST match between parent and child folders. Exiting after full check...')

    if problem_child_flag:
        sys.exit(8)

    # compare basename to available "content" YAML files
    content_yamls = [content for content in glob(os.path.join(yaml_dir, '*.yaml'))
                     if os.path.basename(content) == basename + '.yaml']

    if len(content_yamls) != 1:
        if len(content_yamls) > 1:
            print('More than one content file matches your parent directory\'s basename (' + basename + '):')
            for content in content_yamls:
                print(' ' + content)
            print('This should never happen. Please debug records.py.')
        else:
            # report the directory that was actually searched
            print('No content .yaml files in ' + yaml_dir + ' match the basename: ' + basename)
            print('Make sure a matching content .yaml file exists in the YAML directory you provided.')

        print('Exiting...')
        sys.exit(9)

    content_yaml = content_yamls[0]

    # sanity-check entries in correct content YAML file
    print('Sanity-checking: ' + content_yaml)
    with open(content_yaml, 'r') as f:
        # NOTE(review): yaml.CLoader is a full (not "safe") loader and needs
        # PyYAML built with libyaml; confirm the content files are trusted.
        content = yaml.load(f, Loader=yaml.CLoader)

    if not isinstance(content, dict):
        print(content_yaml + ' does not contain a mapping of fields.')
        print('No empty fields allowed in content .yaml files. Exiting...')
        sys.exit(10)

    badflag = False
    for key, value in content.items():
        # YAML loads "key:" as None and scalars such as numbers have no
        # length, so only call len() on values that support it
        is_empty = value is None or (hasattr(value, '__len__') and len(value) == 0)
        if is_empty:
            badflag = True
            shown = '' if value is None else str(value)
            print('Empty field in ' + content_yaml + ':')
            print(' ' + str(key) + ': "' + shown + '"')

    if badflag:
        print('No empty fields allowed in content .yaml files. Exiting...')
        sys.exit(10)
def _load_nda_template(basename):
    """Return ``(ndaheader, header)`` for the NDA structure *basename* starts with.

    ``ndaheader`` is the literal first line written to every records CSV and
    ``header`` is the second row of the matching template CSV under
    ``HERE/templates``.

    Raises:
        ValueError: if *basename* starts with no supported structure name or
            the template file has fewer than two rows.
    """
    structures = (
        ('fmriresults01', '"fmriresults","01"'),
        ('imagingcollection01', '"imagingcollection","01"'),
        ('image03', '"image","03"'),
    )
    for nda_struct, ndaheader in structures:
        if basename.startswith(nda_struct):
            break
    else:
        raise ValueError(basename + ' does not start with a supported NDA structure name.')

    template = os.path.join(HERE, 'templates', nda_struct + '_template.csv')
    with open(template, 'r') as f:
        for i, row in enumerate(csv.reader(f)):
            if i == 1:
                return ndaheader, row
    raise ValueError(template + ' has no second row to use as the column header.')


def _write_records_csv(path, ndaheader, header, records):
    """Write the NDA header line, the column header and *records* to *path*."""
    with open(path, 'w') as f:
        f.write(ndaheader + '\n')
        writer = csv.DictWriter(f, fieldnames=header, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        writer.writerows(records)


def _write_folder_list(path, folders):
    """Write one folder path per line to *path*."""
    with open(path, 'w') as f:
        f.writelines(folder + '\n' for folder in folders)


def cli(input, lookup_csv, yaml_dir, max_batch_size=500):
    """Create NDA manifests, records and batch files for one parent folder.

    For every "sub-..." upload folder directly under the parent that is listed
    in ``subject_list.csv`` (read from the directory above the parent) and has
    exactly one matching row in the lookup CSV, this runs the bundled
    ``nda_manifests.py`` script, builds one record from the template header,
    the content YAML and the lookup row, and finally writes:

    * ``<parent>.complete_records.csv`` and ``<parent>.complete_folders.txt``
    * ``<parent>.records_<total>_<max>_<i>.csv`` and
      ``<parent>.folders_<total>_<max>_<i>.txt`` for each batch ``i``

    Args:
        input: path to the parent folder.
        lookup_csv: path to the lookup CSV with ``bids_subject_id`` and
            ``bids_session_id`` columns.
        yaml_dir: directory containing ``<parent basename>.yaml``.
        max_batch_size: largest number of records per batch file (default 500).

    Raises:
        ValueError: if ``max_batch_size`` is smaller than 1 or the parent name
            does not map to a known NDA template.
    """
    if max_batch_size < 1:
        raise ValueError('max_batch_size must be a positive integer.')

    # setting easy use variables from argparse
    parent = os.path.abspath(os.path.realpath(input))
    manifest_script = os.path.join(HERE, 'manifest-data', 'nda_manifests.py')

    # grab parent's basename
    basename = os.path.basename(parent)

    # start an empty data template
    ndaheader, header = _load_nda_template(basename)

    # grabbing yaml
    content_yamls = [content for content in glob(os.path.join(yaml_dir, '*.yaml'))
                     if os.path.basename(content) == basename + '.yaml']
    content_yaml = content_yamls[0]

    with open(content_yaml, 'r') as f:
        # NOTE(review): yaml.CLoader is a full (not "safe") loader and needs
        # PyYAML built with libyaml; confirm the content files are trusted.
        content = yaml.load(f, Loader=yaml.CLoader)

    # load lookup CSV file
    lookup_df = pd.read_csv(lookup_csv)

    # get subject list (first row is the column header)
    with open(os.path.join(os.path.dirname(parent), 'subject_list.csv')) as f:
        subject_list = list(csv.reader(f))[1:]

    # Resolve every listed subject/session to its lookup row once, up front.
    # Building a new mapping avoids removing items from subject_list while
    # iterating over it, which silently skipped the entry after each removal.
    lookup_records = {}
    for subject_session in subject_list:
        if len(subject_session) < 2:
            # blank or malformed line: nothing to look up
            continue
        row = lookup_df.loc[(lookup_df['bids_subject_id'] == subject_session[0])
                            & (lookup_df['bids_session_id'] == subject_session[1])]
        if len(row) != 1:
            print('WARNING: {} {} not found in {}'.format(subject_session[0], subject_session[1], lookup_csv))
            continue
        lookup_records.setdefault('_'.join(subject_session), row.to_dict(orient='records')[0])

    ### DO WORK ###

    # 1. GLOB all .../ndastructure_type.class.subset/sub-subject_ses-session.type.class.subset/ folders
    uploads = glob(os.path.join(parent, '*.*.*.*'))

    # 2. loop over the folders
    subprocess.call(('echo `date` Creating NDA records'), shell=True)

    records = []
    folders = []
    for upload_dir in uploads:
        # skip to the next iteration of the for loop if the upload_dir is not a directory
        if not os.path.isdir(upload_dir):
            continue

        # create an NDA record for each folder using the content YAML file
        upload_basename = os.path.basename(upload_dir)
        bids_subject_session, datatype, dataclass, datasubset = upload_basename.split('.')

        lookup_record = lookup_records.get(bids_subject_session)
        if lookup_record is None:
            continue

        # nda-manifest each folder; argument lists (no shell) keep paths that
        # contain spaces or shell metacharacters intact
        manifest_file = '.'.join([upload_dir, 'manifest', 'json'])
        status = subprocess.call(['python3', manifest_script, '-id', '.', '-of', manifest_file],
                                 cwd=upload_dir, stdout=subprocess.DEVNULL)
        if status != 0:
            print('WARNING: {} exited with status {} for {}'.format(manifest_script, status, upload_dir))

        # correct the manifest contents to remove the leading "./" from each manifest element
        subprocess.call(['sed', '-i', r's|\./||g', manifest_file])

        # write the new record for entry into the larger output CSV:
        # template columns first, filled from the content YAML where present
        new_record = {column: content[column] if column in content else ''
                      for column in header}

        description = '.'.join([datatype, dataclass, datasubset])
        if basename.startswith('fmriresults01') or basename.startswith('image03'):
            new_record['manifest'] = os.path.basename(manifest_file)
            new_record['image_description'] = description
        elif basename.startswith('imagingcollection01'):
            new_record['image_manifest'] = os.path.basename(manifest_file)
            new_record['image_collection_desc'] = description

        # lookup columns override template/YAML values, except the two BIDS keys
        for column in lookup_record:
            if column != 'bids_subject_id' and column != 'bids_session_id':
                new_record[column] = lookup_record[column]

        records.append(new_record)
        folders.append(upload_dir)

    _write_records_csv(parent + '.complete_records.csv', ndaheader, header, records)
    _write_folder_list(parent + '.complete_folders.txt', folders)

    # Split into the fewest batches that respect max_batch_size, sized as
    # evenly as possible.
    total = len(records)
    count = math.ceil(total / max_batch_size)
    if count == 0:
        print('WARNING: There are no records of datatype {}'.format(basename))
        batch_size = 0
    else:
        batch_size = math.ceil(total / count)

    subprocess.call(('echo `date` Creating batch files'), shell=True)

    for i in range(1, count + 1):
        # Plain slicing yields the (possibly shorter) remainder for the last
        # batch; the former "total % batch_size" gave an empty last batch
        # whenever total was an exact multiple of batch_size.
        low = (i - 1) * batch_size
        records_subset = records[low:low + batch_size]
        folders_subset = folders[low:low + batch_size]

        batchname = '_'.join([str(total), str(max_batch_size), str(i)])
        _write_records_csv(parent + '.records_' + batchname + '.csv', ndaheader, header, records_subset)
        _write_folder_list(parent + '.folders_' + batchname + '.txt', folders_subset)

    print("FINISHED " + basename + " RECORDS PREPARATION.")
if __name__ == "__main__":
    # Parse the command line, validate the inputs (the check exits with a
    # non-zero code on failure), then build the records and batch files.
    cli_args = generate_parser().parse_args()
    user_inputs = (cli_args.parent, cli_args.lookup_csv, cli_args.yaml_dir)

    records_sanity_check(*user_inputs)
    cli(*user_inputs)

    sys.exit(0)