Skip to content

Commit 117cd49

Browse files
authored
Merge pull request #109 from icgc-argo/payload-gen-variant-filtering@0.1.0
[release]
2 parents 28a0e5e + aa89a09 commit 117cd49

27 files changed

Lines changed: 1081 additions & 0 deletions
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
.gitignore
2+
.nextflow*
3+
tests
4+
work
5+
outdir
Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
*.py[cod]
2+
3+
# C extensions
4+
*.so
5+
6+
# Packages
7+
*.egg
8+
*.egg-info
9+
dist
10+
build
11+
eggs
12+
.eggs
13+
parts
14+
bin
15+
var
16+
sdist
17+
develop-eggs
18+
.installed.cfg
19+
lib
20+
lib64
21+
venv*/
22+
pyvenv*/
23+
24+
# Installer logs
25+
pip-log.txt
26+
27+
# Unit test / coverage reports
28+
.coverage
29+
.tox
30+
.coverage.*
31+
nosetests.xml
32+
coverage.xml
33+
htmlcov
34+
35+
# Translations
36+
*.mo
37+
38+
# Mr Developer
39+
.mr.developer.cfg
40+
.project
41+
.pydevproject
42+
.idea
43+
*.iml
44+
*.komodoproject
45+
46+
# Complexity
47+
output/*.html
48+
output/*/index.html
49+
50+
# Sphinx
51+
docs/_build
52+
53+
.DS_Store
54+
*~
55+
.*.sw[po]
56+
.build
57+
.ve
58+
.env
59+
.cache
60+
.pytest
61+
.bootstrap
62+
.appveyor.token
63+
*.bak
64+
*.log
65+
.vscode
66+
.python-version
67+
.nextflow*
68+
work
69+
outdir
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# Runtime image for the payload-gen-variant-filtering tool.
FROM python:3.7.5-slim-buster

# Link the published image back to its source repository.
# key="value" form is used instead of the legacy whitespace-separated form.
LABEL org.opencontainers.image.source="https://github.com/icgc-argo/data-processing-utility-tools"

# Create an unprivileged user/group (uid/gid 1000) together with its home directory.
RUN groupadd -g 1000 ubuntu &&\
    useradd -l -u 1000 -g ubuntu ubuntu &&\
    install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

# Scripts copied into /tools below become callable by name.
ENV PATH="/tools:${PATH}"

COPY *.py /tools/

# Everything after this point runs without root privileges.
USER ubuntu

CMD ["/bin/bash"]
Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,103 @@
1+
#!/usr/bin/env nextflow
2+
3+
/*
4+
Copyright (C) 2021, Ontario Institute for Cancer Research
5+
6+
This program is free software: you can redistribute it and/or modify
7+
it under the terms of the GNU Affero General Public License as published by
8+
the Free Software Foundation, either version 3 of the License, or
9+
(at your option) any later version.
10+
11+
This program is distributed in the hope that it will be useful,
12+
but WITHOUT ANY WARRANTY; without even the implied warranty of
13+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14+
GNU Affero General Public License for more details.
15+
16+
You should have received a copy of the GNU Affero General Public License
17+
along with this program. If not, see <http://www.gnu.org/licenses/>.
18+
19+
Authors:
20+
Linda Xiang
21+
*/
22+
23+
/********************************************************************/
24+
/* this block is auto-generated based on info from pkg.json where */
25+
/* changes can be made if needed, do NOT modify this block manually */
26+
nextflow.enable.dsl = 2
27+
version = '0.1.0' // package version
28+
29+
container = [
30+
'ghcr.io': 'ghcr.io/icgc-argo/data-processing-utility-tools.payload-gen-variant-filtering'
31+
]
32+
default_container_registry = 'ghcr.io'
33+
/********************************************************************/
34+
35+
36+
// universal params go here
37+
params.container_registry = ""
38+
params.container_version = ""
39+
params.container = ""
40+
41+
params.cpus = 1
42+
params.mem = 1 // GB
43+
params.publish_dir = "" // set to empty string will disable publishDir
44+
45+
46+
// tool specific params go here, add / change as needed
47+
params.analysis = ""
48+
params.files_to_upload = []
49+
params.wf_name = ""
50+
params.wf_short_name = ""
51+
params.wf_version = ""
52+
params.controlled = false
53+
54+
55+
// Runs main.py inside the tool container to generate the payload JSON
// for the given analysis and the files to upload.
process payloadGenVariantFiltering {
  container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
  // `enabled` expects a boolean; convert the (possibly empty) publish_dir string
  // explicitly instead of passing the path string itself.
  publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir ? true : false

  cpus params.cpus
  memory "${params.mem} GB"

  input:  // input, make update as needed
    path analysis
    path files_to_upload
    val wf_name
    val wf_short_name
    val wf_version
    val controlled

  output:  // output, make update as needed
    path "*.payload.json", emit: payload

  script:
    // add and initialize variables here as needed
    // the -c flag is only passed when controlled access is requested
    arg_controlled = controlled ? "-c" : ""

    """
    main.py \
      -a ${analysis} \
      -f ${files_to_upload} \
      -w ${wf_name} \
      -s ${wf_short_name} \
      -v ${wf_version} \
      -r ${workflow.runName} \
      -j ${workflow.sessionId} \
      ${arg_controlled}

    """
}
90+
91+
92+
// this provides an entry point for this main script, so it can be run directly without cloning the repo
93+
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
94+
workflow {
  // resolve the two file inputs first, then pass everything in the order
  // the process declares its inputs
  analysis_json = file(params.analysis)
  upload_files = Channel.fromPath(params.files_to_upload).collect()

  payloadGenVariantFiltering(
    analysis_json,
    upload_files,
    params.wf_name,
    params.wf_short_name,
    params.wf_version,
    params.controlled
  )
}
Lines changed: 161 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,161 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Copyright (C) 2021, Ontario Institute for Cancer Research
6+
7+
This program is free software: you can redistribute it and/or modify
8+
it under the terms of the GNU Affero General Public License as published by
9+
the Free Software Foundation, either version 3 of the License, or
10+
(at your option) any later version.
11+
12+
This program is distributed in the hope that it will be useful,
13+
but WITHOUT ANY WARRANTY; without even the implied warranty of
14+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15+
GNU Affero General Public License for more details.
16+
17+
You should have received a copy of the GNU Affero General Public License
18+
along with this program. If not, see <http://www.gnu.org/licenses/>.
19+
20+
Authors:
21+
Linda Xiang
22+
"""
23+
24+
import os
25+
import argparse
26+
import json
27+
import uuid
28+
import hashlib
29+
import copy
30+
31+
# Per variant type, a four-slot list:
#   [0] data category
#   [1] data type
#   [2] analysis tools used when the input workflow is sanger-wgs / sanger-wxs
#   [3] analysis tools used when the input workflow is gatk-mutect2
variant_type_to_data_type_etc = {
    'snv': [
        'Simple Nucleotide Variation',
        'Raw SNV Calls',
        ['CaVEMan', 'bcftools'],
        ['GATK-Mutect2', 'bcftools'],
    ],
    'indel': [
        'Simple Nucleotide Variation',
        'Raw InDel Calls',
        ['Pindel', 'bcftools'],
        ['GATK-Mutect2', 'bcftools'],
    ],
}

# Display name written to the payload for each workflow name accepted via -w.
workflow_full_name = {
    'open-access-variant-filtering': 'Open Access Variant Filtering',
}
39+
40+
def calculate_size(file_path):
    """Return the size in bytes of the file at *file_path*."""
    size_in_bytes = os.path.getsize(file_path)
    return size_in_bytes
42+
43+
44+
def calculate_md5(file_path):
    """Return the hexadecimal MD5 digest of the file at *file_path*.

    The file is consumed in 1 MiB pieces so it is never loaded into memory
    as a whole.
    """
    piece_size = 1024 * 1024
    digest = hashlib.md5()
    with open(file_path, 'rb') as stream:
        while True:
            piece = stream.read(piece_size)
            if piece == b'':
                break
            digest.update(piece)
    return digest.hexdigest()
50+
51+
52+
def get_files_info(file_to_upload, args, input_wf, variant_type):
    """Build the payload entry describing one file to upload.

    Args:
        file_to_upload: path of the file on disk.
        args: parsed command-line arguments; ``wf_short_name`` and
            ``controlled`` are read.
        input_wf: short name of the workflow that produced the input analysis.
        variant_type: key into ``variant_type_to_data_type_etc``.

    Returns:
        A dict with fileName, fileType, fileSize, fileMd5sum, fileAccess and
        info; dataType and info.analysis_tools are added only when the file
        suffix / input workflow is recognised.
    """
    # the published name swaps "filtered" for the workflow short name
    published_name = os.path.basename(file_to_upload).replace("filtered", args.wf_short_name)

    if published_name.endswith('.vcf.gz'):
        file_type = 'VCF'
    else:
        # otherwise the upper-cased last extension, e.g. "TBI"
        file_type = published_name.split(".")[-1].upper()

    size = calculate_size(file_to_upload)
    md5sum = calculate_md5(file_to_upload)
    access = 'controlled' if args.controlled else 'open'
    type_details = variant_type_to_data_type_etc[variant_type]

    file_info = {
        'fileName': published_name,
        'fileType': file_type,
        'fileSize': size,
        'fileMd5sum': md5sum,
        'fileAccess': access,
        'info': {
            'data_category': type_details[0],
        },
    }

    # dataType is only set for VCFs and their indexes
    if file_to_upload.endswith('.vcf.gz.tbi'):
        file_info['dataType'] = 'VCF Index'
    elif file_to_upload.endswith('.vcf.gz'):
        file_info['dataType'] = type_details[1]

    # analysis tools depend on which caller workflow produced the input
    if input_wf in ('sanger-wgs', 'sanger-wxs'):
        file_info['info']['analysis_tools'] = type_details[2]
    elif input_wf in ('gatk-mutect2',):
        file_info['info']['analysis_tools'] = type_details[3]

    return file_info
79+
80+
def get_sample_info(sample_list):
    """Return a deep copy of *sample_list* with identifier/info fields removed.

    The keys ``info``, ``sampleId``, ``specimenId``, ``donorId`` and
    ``studyId`` are dropped from every sample and from its nested
    ``specimen`` and ``donor`` records. The input is never modified.

    Args:
        sample_list: list of sample dicts, or ``None``.

    Returns:
        A new list of cleaned sample dicts; an empty list when *sample_list*
        is ``None``.
    """
    if sample_list is None:
        return []

    fields_to_drop = ('info', 'sampleId', 'specimenId', 'donorId', 'studyId')
    samples = copy.deepcopy(sample_list)
    for sample in samples:
        # a sample may lack the nested specimen/donor records; skip those
        records = [sample, sample.get('specimen'), sample.get('donor')]
        for record in records:
            if not isinstance(record, dict):
                continue
            for field in fields_to_drop:
                record.pop(field, None)

    return samples
89+
90+
def get_variant_type(analysis):
    """Return ``'snv'`` or ``'indel'`` based on the dataType of the analysis files.

    Files whose dataType is neither "Raw SNV Calls" nor "Raw InDel Calls"
    (for example "VCF Index") are ignored. When several files match, the last
    one decides.

    Args:
        analysis: analysis dict with a ``files`` list of file dicts.

    Raises:
        ValueError: if no file carries a recognised raw-call dataType.
    """
    data_type_to_variant = {
        "Raw SNV Calls": 'snv',
        "Raw InDel Calls": 'indel',
    }

    variant_type = None
    # tolerate a missing or null "files" entry
    for f in analysis.get('files') or []:
        variant_type = data_type_to_variant.get(f.get('dataType'), variant_type)

    if variant_type is None:
        raise ValueError(
            "Unable to determine variant type: no file with dataType "
            "'Raw SNV Calls' or 'Raw InDel Calls' found in the analysis"
        )

    return variant_type
99+
100+
def main():
    """
    Python implementation of tool: payload-gen-variant-filtering

    Reads the input analysis JSON, assembles a ``variant_filtering`` payload
    describing the files to upload, and writes it to
    ``<uuid4>.variant_filtering.payload.json`` in the current directory.
    """

    parser = argparse.ArgumentParser(description='Tool: payload-gen-variant-filtering')
    parser.add_argument("-a", dest="analysis", required=True,
                        help="json file containing sequencing_alignment SONG analysis for tumour sample")
    parser.add_argument("-f", dest="files_to_upload", type=str, nargs="+", help="Files to be uploaded", required=True)
    parser.add_argument("-w", dest="wf_name", type=str, help="workflow full name", required=True)
    parser.add_argument("-s", dest="wf_short_name", type=str, help="workflow short name", required=True)
    parser.add_argument("-v", dest="wf_version", type=str, required=True, help="workflow version")
    parser.add_argument("-r", dest="wf_run", type=str, required=True, help="workflow run ID")
    parser.add_argument("-j", dest="wf_session", type=str, required=True, help="workflow session ID")
    parser.add_argument("-c", dest="controlled", action='store_true', help="set file to be controlled access")
    args = parser.parse_args()

    with open(args.analysis, 'r') as f:
        analysis = json.load(f)

    # tolerate a missing/null analysisType instead of failing with AttributeError
    input_analysis_type = (analysis.get('analysisType') or {}).get('name')
    input_wf = analysis.get('workflow', {}).get('workflow_short_name')
    variant_type = get_variant_type(analysis)

    output_analysis_type = "variant_filtering"
    payload = {
        'analysisType': {
            'name': output_analysis_type
        },
        'studyId': analysis.get('studyId'),
        'experiment': analysis.get('experiment'),
        'samples': get_sample_info(analysis.get('samples')),
        'files': [],
        'workflow': {
            # fall back to the name as given so an unmapped workflow never
            # ends up as null in the payload
            'workflow_name': workflow_full_name.get(args.wf_name, args.wf_name),
            'workflow_short_name': args.wf_short_name,
            'workflow_version': args.wf_version,
            'run_id': args.wf_run,
            'session_id': args.wf_session,
            'inputs': [
                {
                    'input_analysis_id': analysis.get('analysisId'),
                    'analysis_type': input_analysis_type
                }
            ],
            'genome_build': 'GRCh38_hla_decoy_ebv'
        },
        'variant_class': analysis.get('variant_class')
    }

    for file_to_upload in args.files_to_upload:
        payload['files'].append(get_files_info(file_to_upload, args, input_wf, variant_type))

    # a random UUID keeps payload file names unique across runs
    payload_path = "%s.%s.payload.json" % (str(uuid.uuid4()), output_analysis_type)
    with open(payload_path, 'w') as f:
        json.dump(payload, f, indent=2)


if __name__ == "__main__":
    main()
161+
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
// Docker execution settings: containers are enabled and started with
// `-u <uid>:<gid>` taken from the `id` command.
docker.enabled = true
docker.runOptions = '-u \$(id -u):\$(id -g)'

0 commit comments

Comments
 (0)