Skip to content

Commit f2bba84

Browse files
committed
Merge branch 'main' into helper-functions@1.0.1
2 parents 16e64a2 + 1c8dc76 commit f2bba84

86 files changed

Lines changed: 3085 additions & 7 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.gitignore
2+
.nextflow*
3+
tests
4+
work
5+
outdir

payload-add-uniform-ids/.gitignore

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
*.py[cod]
2+
3+
# C extensions
4+
*.so
5+
6+
# Packages
7+
*.egg
8+
*.egg-info
9+
dist
10+
build
11+
eggs
12+
.eggs
13+
parts
14+
bin
15+
var
16+
sdist
17+
develop-eggs
18+
.installed.cfg
19+
lib
20+
lib64
21+
venv*/
22+
pyvenv*/
23+
24+
# Installer logs
25+
pip-log.txt
26+
27+
# Unit test / coverage reports
28+
.coverage
29+
.tox
30+
.coverage.*
31+
nosetests.xml
32+
coverage.xml
33+
htmlcov
34+
35+
# Translations
36+
*.mo
37+
38+
# Mr Developer
39+
.mr.developer.cfg
40+
.project
41+
.pydevproject
42+
.idea
43+
*.iml
44+
*.komodoproject
45+
46+
# Complexity
47+
output/*.html
48+
output/*/index.html
49+
50+
# Sphinx
51+
docs/_build
52+
53+
.DS_Store
54+
*~
55+
.*.sw[po]
56+
.build
57+
.ve
58+
.env
59+
.cache
60+
.pytest
61+
.bootstrap
62+
.appveyor.token
63+
*.bak
64+
*.log
65+
.vscode
66+
.python-version
67+
.nextflow*
68+
work
69+
outdir

payload-add-uniform-ids/Dockerfile

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Runtime image for the payload-add-uniform-ids tool.
FROM python:3.7.5-slim-buster

# Use the key=value form; the whitespace-separated "LABEL key value" form is
# a legacy syntax that current Docker build checks warn about.
LABEL org.opencontainers.image.source="https://github.com/icgc-argo/data-processing-utility-tools"

# Put /tools first on PATH so the copied scripts can be invoked by name.
ENV PATH="/tools:${PATH}"

COPY *.py /tools/

# Create an unprivileged user (uid/gid 1000) together with its home directory.
RUN groupadd -g 1000 ubuntu && \
    useradd -l -u 1000 -g ubuntu ubuntu && \
    install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

USER ubuntu

CMD ["/bin/bash"]

payload-add-uniform-ids/main.nf

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
#!/usr/bin/env nextflow
2+
3+
/*
4+
Copyright (C) 2021, Ontario Institute for Cancer Research
5+
6+
This program is free software: you can redistribute it and/or modify
7+
it under the terms of the GNU Affero General Public License as published by
8+
the Free Software Foundation, either version 3 of the License, or
9+
(at your option) any later version.
10+
11+
This program is distributed in the hope that it will be useful,
12+
but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
GNU Affero General Public License for more details.
15+
16+
You should have received a copy of the GNU Affero General Public License
17+
along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
Authors:
20+
Junjun Zhang
21+
*/
22+
23+
/********************************************************************/
24+
/* this block is auto-generated based on info from pkg.json where */
25+
/* changes can be made if needed, do NOT modify this block manually */
26+
nextflow.enable.dsl = 2
27+
version = '0.1.1'
28+
29+
container = [
30+
'ghcr.io': 'ghcr.io/icgc-argo/data-processing-utility-tools.payload-add-uniform-ids'
31+
]
32+
default_container_registry = 'ghcr.io'
33+
/********************************************************************/
34+
35+
36+
// universal params go here
37+
params.container_registry = ""
38+
params.container_version = ""
39+
params.container = ""
40+
41+
params.cpus = 1
42+
params.mem = 1 // GB
43+
params.publish_dir = "" // set to empty string will disable publishDir
44+
45+
46+
// tool specific params go here, add / change as needed
47+
params.payload_json = ""
48+
params.id_mapping_tsv = ""
49+
50+
51+
// Runs main.py inside the tool container to add uniform IDs to a payload JSON.
// Inputs:  the payload JSON and the ID mapping TSV.
// Output:  every JSON file written to output_dir, emitted as `payload`.
process payloadAddUniformIds {
  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
  // `enabled` expects a boolean, so coerce the (possibly empty) publish_dir string explicitly
  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

  cpus params.cpus
  memory "${params.mem} GB"

  input:  // input, make update as needed
    path payload_json
    path id_mapping_tsv

  output:  // output, make update as needed
    path "output_dir/*.json", emit: payload

  script:
    // add and initialize variables here as needed

    // the staged paths are quoted so that file names containing spaces survive shell word splitting
    """
    mkdir -p output_dir

    main.py \
      -p "${payload_json}" \
      -i "${id_mapping_tsv}" \
      -o output_dir

    """
}
78+
79+
80+
// this provides an entry point for this main script, so it can be run directly without cloning the repo
81+
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
82+
workflow {
  // checkIfExists makes a missing or unset input fail at launch instead of inside the task
  payloadAddUniformIds(
    file(params.payload_json, checkIfExists: true),
    file(params.id_mapping_tsv, checkIfExists: true)
  )
}

payload-add-uniform-ids/main.py

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Copyright (C) 2021, Ontario Institute for Cancer Research
6+
7+
This program is free software: you can redistribute it and/or modify
8+
it under the terms of the GNU Affero General Public License as published by
9+
the Free Software Foundation, either version 3 of the License, or
10+
(at your option) any later version.
11+
12+
This program is distributed in the hope that it will be useful,
13+
but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
GNU Affero General Public License for more details.
16+
17+
You should have received a copy of the GNU Affero General Public License
18+
along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
20+
Authors:
21+
Junjun Zhang
22+
"""
23+
24+
import os
25+
import sys
26+
import argparse
27+
import csv
28+
import json
29+
30+
31+
def get_id_mapping(id_mapping_tsv):
    """
    Load the submitter ID to uniform ID mapping from a tab-separated file.

    The file must start with a header row containing the columns 'type',
    'submitter_id' and 'uniform_id'. The mapping is returned as a two-level
    dict: {type: {submitter_id: uniform_id}}.

    The program is terminated via sys.exit() with an explanatory message when:
      - a required column is missing from the header,
      - a 'type' value appears on more than one row,
      - no mapping is provided for any of 'donor', 'specimen' or 'sample'.
    """
    required_columns = ('type', 'submitter_id', 'uniform_id')
    required_types = ('donor', 'specimen', 'sample')

    id_mapping = dict()
    # newline='' is what the csv module asks for when it is handed a file object
    with open(id_mapping_tsv, 'r', newline='') as f:
        reader = csv.DictReader(f, delimiter='\t')

        # report a broken header as a clean error instead of a KeyError traceback
        header = reader.fieldnames or []
        missing_columns = [column for column in required_columns if column not in header]
        if missing_columns:
            sys.exit(f"Required column(s) missing: {', '.join(missing_columns)}, in file: {id_mapping_tsv}")

        for row in reader:
            id_type = row['type']  # not named 'type' to avoid shadowing the builtin
            submitter_id = row['submitter_id']
            uniform_id = row['uniform_id']

            if id_type in id_mapping:
                sys.exit(f"Values in 'type' field duplicated. Offending value: {id_type}, in file: {id_mapping_tsv}")
            id_mapping[id_type] = dict()

            # NOTE: each type is limited to a single row above, so the per-type dict
            # is always empty at this point and this guard cannot fire today; it is
            # kept so submitter IDs stay unique if the one-row-per-type rule is relaxed.
            if submitter_id in id_mapping[id_type]:
                sys.exit(f"Values in 'submitter_id' field duplicated. Offending value: {submitter_id}, for type: {id_type}, in file: {id_mapping_tsv}" )
            id_mapping[id_type][submitter_id] = uniform_id

    if any(required not in id_mapping for required in required_types):
        sys.exit(f"Provided id_mapping_tsv file '{id_mapping_tsv}' is required to have ID mappings for 'donor', 'specimen' and 'sample'")

    return id_mapping
52+
53+
54+
def _lookup_uniform_id(id_mapping, entity, submitter_id):
    """
    Return the uniform ID mapped to submitter_id for the given entity
    ('sample', 'specimen' or 'donor'); exit if there is no non-empty mapping.
    """
    uniform_id = id_mapping.get(entity, {}).get(submitter_id)
    if not uniform_id:
        sys.exit(f"Provided id_mapping_tsv misses mapping for submitter {entity} ID: {submitter_id}")
    return uniform_id


def add_uniform_ids(payload, id_mapping):
    """
    Add uniform sample/specimen/donor IDs to every sample of the payload, in place.

    payload: dict parsed from the payload JSON; must have a non-empty 'samples'
        list whose entries carry 'submitterSampleId',
        'specimen.submitterSpecimenId' and 'donor.submitterDonorId'.
    id_mapping: two-level dict {type: {submitter_id: uniform_id}} with the
        types 'sample', 'specimen' and 'donor'.

    For each sample this sets 'sampleId' and 'specimenId' on the sample,
    'specimenId' and 'donorId' on its specimen, and 'donorId' on its donor.
    Returns None. The program is terminated via sys.exit() when there are no
    samples, a sample entry is malformed, or a mapping is missing.
    """
    # pop and re-add: 'samples' ends up as the last key of the payload
    samples = payload.pop('samples', [])
    if not samples:
        sys.exit("Error: no 'samples' found in the input payload JSON")

    updated_samples = []
    for sample in samples:
        # report a malformed entry as a clean error instead of a traceback
        try:
            submitter_sample_id = sample['submitterSampleId']
            submitter_specimen_id = sample['specimen']['submitterSpecimenId']
            submitter_donor_id = sample['donor']['submitterDonorId']
        except (KeyError, TypeError) as err:
            sys.exit(f"Error: malformed sample entry in the input payload JSON, missing or invalid field: {err}")

        sample['sampleId'] = _lookup_uniform_id(id_mapping, 'sample', submitter_sample_id)

        specimen_id = _lookup_uniform_id(id_mapping, 'specimen', submitter_specimen_id)
        sample['specimenId'] = specimen_id
        sample['specimen']['specimenId'] = specimen_id

        donor_id = _lookup_uniform_id(id_mapping, 'donor', submitter_donor_id)
        sample['donor']['donorId'] = donor_id
        sample['specimen']['donorId'] = donor_id

        updated_samples.append(sample)

    payload['samples'] = updated_samples
81+
82+
83+
def main():
    """
    Add uniform IDs for donor/specimen/sample to the original input payload JSON

    Command line arguments:
      -p / --payload-json    input payload JSON file
      -i / --id-mapping-tsv  TSV file mapping submitter IDs to uniform IDs
      -o / --output-dir      existing directory to write the result into

    The updated payload is written to
    <output-dir>/<payload file name without extension>.uniform_id_added.json.
    Any invalid input terminates the program via sys.exit() with a message.
    """

    parser = argparse.ArgumentParser(description='Tool: payload-add-uniform-ids')
    parser.add_argument('-p', '--payload-json', type=str,
                        help='Input payload JSON', required=True)
    parser.add_argument('-i', '--id-mapping-tsv', type=str,
                        help='TSV file containing mapping between submitter IDs and uniform IDs', required=True)
    parser.add_argument('-o', '--output-dir', type=str,
                        help='Output directory', required=True)
    args = parser.parse_args()

    if not os.path.isfile(args.payload_json):
        sys.exit(f'Error: specified input payload JSON {args.payload_json} does not exist or is not accessible!')

    if not os.path.isfile(args.id_mapping_tsv):
        sys.exit(f'Error: specified ID mapping TSV {args.id_mapping_tsv} does not exist or is not accessible!')

    if not os.path.isdir(args.output_dir):
        sys.exit(f'Error: specified output dir {args.output_dir} does not exist or is not accessible!')

    # read as UTF-8 explicitly rather than relying on the locale default
    with open(args.payload_json, 'r', encoding='utf-8') as p:
        try:
            payload = json.load(p)
        except json.JSONDecodeError as err:
            # report unparsable input as a clean error instead of a traceback
            sys.exit(f'Error: specified input payload JSON {args.payload_json} is not valid JSON: {err}')

    # add_uniform_ids() works on a dict, so reject any other top-level JSON value here
    if not isinstance(payload, dict):
        sys.exit(f'Error: specified input payload JSON {args.payload_json} must contain a JSON object at the top level')

    id_mapping = get_id_mapping(args.id_mapping_tsv)

    add_uniform_ids(payload, id_mapping)

    payload_name = os.path.splitext(os.path.basename(args.payload_json))[0]
    output_payload_file = f"{payload_name}.uniform_id_added.json"

    with open(os.path.join(args.output_dir, output_payload_file), 'w', encoding='utf-8') as o:
        json.dump(payload, o, indent=2)
117+
118+
119+
if __name__ == "__main__":
120+
main()
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
docker {
2+
enabled = true
3+
runOptions = '-u \$(id -u):\$(id -g)'
4+
}

payload-add-uniform-ids/pkg.json

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
{
2+
"name": "payload-add-uniform-ids",
3+
"version": "0.1.1",
4+
"description": "SONG payload utility tool for adding uniform IDs",
5+
"main": "main.nf",
6+
"deprecated": false,
7+
"keywords": [
8+
"bioinformatics",
9+
"metadata"
10+
],
11+
"repository": {
12+
"type": "git",
13+
"url": "https://github.com/icgc-argo/data-processing-utility-tools.git"
14+
},
15+
"container": {
16+
"registries": [
17+
{
18+
"registry": "ghcr.io",
19+
"type": "docker",
20+
"org": "icgc-argo",
21+
"default": true
22+
}
23+
]
24+
},
25+
"dependencies": [],
26+
"devDependencies": [],
27+
"contributors": [
28+
{
29+
"name": "Junjun Zhang",
30+
"email": "junjun.ca@gmail.com"
31+
}
32+
],
33+
"license": "GNU Affero General Public License v3",
34+
"bugReport": "https://github.com/icgc-argo/data-processing-utility-tools/issues",
35+
"homepage": "https://github.com/icgc-argo/data-processing-utility-tools#readme"
36+
}

0 commit comments

Comments
 (0)