Skip to content

Commit 8eccec7

Browse files
committed
implemented the new payload-add-uniform-ids tool
1 parent d60d9ca commit 8eccec7

19 files changed

Lines changed: 355 additions & 36 deletions

payload-add-uniform-ids/Dockerfile

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,15 @@
1-
FROM ubuntu:20.04
1+
FROM python:3.7.5-slim-buster
22

33
LABEL org.opencontainers.image.source https://github.com/icgc-argo/data-processing-utility-tools
44

55
ENV PATH="/tools:${PATH}"
66

77
COPY *.py /tools/
88

9+
RUN groupadd -g 1000 ubuntu && \
10+
useradd -l -u 1000 -g ubuntu ubuntu && \
11+
install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu
12+
13+
USER ubuntu
14+
915
CMD ["/bin/bash"]

payload-add-uniform-ids/main.nf

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,8 @@ params.publish_dir = "" // set to empty string will disable publishDir
4444

4545

4646
// tool specific params go here, add / change as needed
47-
params.input_file = ""
48-
params.output_pattern = "*" // output file name pattern
47+
params.payload_json = ""
48+
params.id_mapping_tsv = ""
4949

5050

5151
process payloadAddUniformIds {
@@ -56,10 +56,11 @@ process payloadAddUniformIds {
5656
memory "${params.mem} GB"
5757

5858
input: // input, make update as needed
59-
path input_file
59+
path payload_json
60+
path id_mapping_tsv
6061

6162
output: // output, make update as needed
62-
path "output_dir/${params.output_pattern}", emit: output_file
63+
path "output_dir/*.json", emit: payload
6364

6465
script:
6566
// add and initialize variables here as needed
@@ -68,7 +69,8 @@ process payloadAddUniformIds {
6869
mkdir -p output_dir
6970
7071
main.py \
71-
-i ${input_file} \
72+
-p ${payload_json} \
73+
-i ${id_mapping_tsv} \
7274
-o output_dir
7375
7476
"""
@@ -79,6 +81,7 @@ process payloadAddUniformIds {
7981
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
8082
workflow {
8183
payloadAddUniformIds(
82-
file(params.input_file)
84+
file(params.payload_json),
85+
file(params.id_mapping_tsv)
8386
)
8487
}

payload-add-uniform-ids/main.py

Lines changed: 76 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,32 +24,97 @@
2424
import os
2525
import sys
2626
import argparse
27-
import subprocess
27+
import csv
28+
import json
29+
30+
31+
def get_id_mapping(id_mapping_tsv):
    """
    Load the submitter-ID to uniform-ID mapping from a tab-separated file.

    The file is read with a header row; each data row is expected to provide
    the columns 'type', 'submitter_id' and 'uniform_id'.

    Args:
        id_mapping_tsv: path to the TSV file.

    Returns:
        A dict keyed by the 'type' value, whose values are dicts mapping
        'submitter_id' to 'uniform_id', e.g.
        {'donor': {'d1': 'DO1'}, 'specimen': {...}, 'sample': {...}}.

    Raises:
        SystemExit: if a 'type' value occurs on more than one row, or if the
            file does not contain mappings for all of 'donor', 'specimen'
            and 'sample'.
    """
    id_mapping = dict()
    # newline='' is what the csv module recommends for files handed to a reader
    with open(id_mapping_tsv, 'r', newline='') as f:
        for row in csv.DictReader(f, delimiter='\t'):
            entity_type = row['type']  # avoid shadowing the builtin 'type'

            # Each type may appear on one row only. Because of that, the same
            # submitter_id can never repeat within a type, so no separate
            # per-type duplicate check is needed.
            # NOTE(review): the nested dict shape would allow several IDs per
            # type; if multi-row types should be supported, relax this check
            # and detect duplicated submitter_id values per type instead.
            if entity_type in id_mapping:
                # Report the path via the function parameter: 'args' only
                # exists inside main() and is not visible here.
                sys.exit(f"Values in 'type' field duplicated. Offending value: {entity_type}, in file: {id_mapping_tsv}")

            id_mapping[entity_type] = {row['submitter_id']: row['uniform_id']}

    if 'donor' not in id_mapping or 'specimen' not in id_mapping or 'sample' not in id_mapping:
        sys.exit(f"Provided id_mapping_tsv file '{id_mapping_tsv}' is required to have ID mappings for 'donor', 'specimen' and 'sample'")

    return id_mapping
52+
53+
54+
def add_uniform_ids(payload, id_mapping):
    """
    Fill in uniform sample/specimen/donor IDs on every sample of the payload.

    The payload is modified in place and nothing is returned. For each entry
    of payload['samples'] the submitter IDs are looked up in id_mapping
    (keyed by 'sample', 'specimen' and 'donor') and the matching uniform IDs
    are written to:
      - sample['sampleId']
      - sample['specimenId'] and sample['specimen']['specimenId']
      - sample['donor']['donorId'] and sample['specimen']['donorId']

    Exits via sys.exit when the payload has no samples, or when a submitter
    ID has no (non-empty) uniform ID in the mapping.
    """
    samples = payload.pop('samples', [])
    if not samples:
        sys.exit("Error: no 'samples' found in the input payload JSON")

    for sample in samples:
        # sample: resolve once, bail out early when the mapping is absent
        sample_id = id_mapping['sample'].get(sample['submitterSampleId'])
        if not sample_id:
            sys.exit(f"Provided id_mapping_tsv misses mapping for submitter sample ID: {sample['submitterSampleId']}")
        sample['sampleId'] = sample_id

        # specimen: the uniform ID is recorded on the sample and on its specimen
        specimen_id = id_mapping['specimen'].get(sample['specimen']['submitterSpecimenId'])
        if not specimen_id:
            sys.exit(f"Provided id_mapping_tsv misses mapping for submitter specimen ID: {sample['specimen']['submitterSpecimenId']}")
        sample['specimenId'] = specimen_id
        sample['specimen']['specimenId'] = specimen_id

        # donor: the uniform ID is recorded on the donor and on the specimen
        donor_id = id_mapping['donor'].get(sample['donor']['submitterDonorId'])
        if not donor_id:
            sys.exit(f"Provided id_mapping_tsv misses mapping for submitter donor ID: {sample['donor']['submitterDonorId']}")
        sample['donor']['donorId'] = donor_id
        sample['specimen']['donorId'] = donor_id

    # Re-attach the (now updated) samples as a new list under the same key.
    payload['samples'] = list(samples)
2881

2982

3083
def main():
    """
    Command-line entry point of the payload-add-uniform-ids tool.

    Reads the payload JSON (-p) and the ID mapping TSV (-i), adds the uniform
    donor/specimen/sample IDs to the payload, and writes the result as
    '<payload file name without extension>.uniform_id_added.json' into the
    output directory (-o). Exits with an error message when either input
    file or the output directory does not exist.
    """
    cli = argparse.ArgumentParser(description='Tool: payload-add-uniform-ids')
    cli.add_argument('-p', '--payload-json', type=str,
                     help='Input payload JSON', required=True)
    cli.add_argument('-i', '--id-mapping-tsv', type=str,
                     help='TSV file containing mapping between submitter IDs and uniform IDs', required=True)
    cli.add_argument('-o', '--output-dir', type=str,
                     help='Output directory', required=True)
    args = cli.parse_args()

    payload_path = args.payload_json
    mapping_path = args.id_mapping_tsv
    out_dir = args.output_dir

    # Validate all paths up front, in the order: payload, mapping, output dir.
    if not os.path.isfile(payload_path):
        sys.exit('Error: specified input payload JSON %s does not exist or is not accessible!' % payload_path)

    if not os.path.isfile(mapping_path):
        sys.exit('Error: specified ID mapping TSV %s does not exist or is not accessible!' % mapping_path)

    if not os.path.isdir(out_dir):
        sys.exit('Error: specified output dir %s does not exist or is not accessible!' % out_dir)

    with open(payload_path, 'r') as payload_fh:
        payload = json.load(payload_fh)

    # The payload is updated in place by add_uniform_ids.
    add_uniform_ids(payload, get_id_mapping(mapping_path))

    # Output name is derived from the input name with its last extension removed.
    stem, _ = os.path.splitext(os.path.basename(payload_path))
    output_payload_file = "%s.uniform_id_added.json" % stem

    with open(os.path.join(out_dir, output_payload_file), 'w') as out_fh:
        out_fh.write(json.dumps(payload, indent=2))
51117

52118

53119
if __name__ == "__main__":
54120
main()
55-

payload-add-uniform-ids/tests/checker.nf

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,8 @@ params.container_version = ""
4343
params.container = ""
4444

4545
// tool specific params go here, add / change as needed
46-
params.input_file = ""
46+
params.payload_json = ""
47+
params.id_mapping_tsv = ""
4748
params.expected_output = ""
4849

4950
include { payloadAddUniformIds } from '../main'
@@ -61,37 +62,35 @@ process file_smart_diff {
6162

6263
script:
6364
"""
64-
# Note: this is only for demo purpose, please write your own 'diff' according to your own needs.
65-
# remove date field before comparison eg, <div id="header_filename">Tue 19 Jan 2021<br/>test_rg_3.bam</div>
66-
# sed -e 's#"header_filename">.*<br/>test_rg_3.bam#"header_filename"><br/>test_rg_3.bam</div>#'
67-
68-
diff <( cat ${output_file} | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
69-
<( ([[ '${expected_file}' == *.gz ]] && gunzip -c ${expected_file} || cat ${expected_file}) | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
65+
diff ${output_file} ${expected_file} \
7066
&& ( echo "Test PASSED" && exit 0 ) || ( echo "Test FAILED, output file mismatch." && exit 1 )
7167
"""
7268
}
7369

7470

7571
workflow checker {
7672
take:
77-
input_file
73+
payload_json
74+
id_mapping_tsv
7875
expected_output
7976

8077
main:
8178
payloadAddUniformIds(
82-
input_file
79+
payload_json,
80+
id_mapping_tsv
8381
)
8482

8583
file_smart_diff(
86-
payloadAddUniformIds.out.output_file,
84+
payloadAddUniformIds.out.payload,
8785
expected_output
8886
)
8987
}
9088

9189

9290
workflow {
9391
checker(
94-
file(params.input_file),
92+
file(params.payload_json),
93+
file(params.id_mapping_tsv),
9594
file(params.expected_output)
9695
)
9796
}
-14.6 KB
Binary file not shown.
Binary file not shown.
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../tests/data/cab531fd-dc75-462f-8a73-fb7fcbf73185.id_mapping.tsv
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../tests/data/cab531fd-dc75-462f-8a73-fb7fcbf73185.sequencing_experiment.payload.json
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../tests/data/dc51bfb0-c073-4abd-a0da-fb67aa73d58b.id_mapping.tsv
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
../../../tests/data/dc51bfb0-c073-4abd-a0da-fb67aa73d58b.sequencing_experiment.payload.json

0 commit comments

Comments
 (0)