Skip to content

Commit 37e45b6

Browse files
committed
add tool and test cases
1 parent ec9967a commit 37e45b6

19 files changed

Lines changed: 780 additions & 60 deletions

payload-gen-variant-processing/Dockerfile

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
FROM python:3.7.5-slim-buster

LABEL org.opencontainers.image.source https://github.com/icgc-argo/data-processing-utility-tools

# Create an unprivileged user so the container does not run as root.
# uid/gid 1000 matches the typical first host user, easing bind-mount ownership.
RUN groupadd -g 1000 ubuntu &&\
    useradd -l -u 1000 -g ubuntu ubuntu &&\
    install -d -m 0755 -o ubuntu -g ubuntu /home/ubuntu

# Make the tool scripts invokable by bare name (e.g. `main.py`).
ENV PATH="/tools:${PATH}"

COPY *.py /tools/

# Drop root privileges for everything executed in the container.
USER ubuntu

CMD ["/bin/bash"]

payload-gen-variant-processing/main.nf

Lines changed: 44 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@
22

33
/*
44
Copyright (C) 2021, Ontario Institute for Cancer Research
5-
5+
66
This program is free software: you can redistribute it and/or modify
77
it under the terms of the GNU Affero General Public License as published by
88
the Free Software Foundation, either version 3 of the License, or
99
(at your option) any later version.
10-
10+
1111
This program is distributed in the hope that it will be useful,
1212
but WITHOUT ANY WARRANTY; without even the implied warranty of
1313
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1414
GNU Affero General Public License for more details.
15-
15+
1616
You should have received a copy of the GNU Affero General Public License
1717
along with this program. If not, see <http://www.gnu.org/licenses/>.
1818
@@ -24,7 +24,7 @@
2424
/* this block is auto-generated based on info from pkg.json where */
2525
/* changes can be made if needed, do NOT modify this block manually */
2626
nextflow.enable.dsl = 2
27-
version = '0.1.0' // package version
27+
version = '0.1.0'
2828

2929
container = [
3030
'ghcr.io': 'ghcr.io/icgc-argo/data-processing-utility-tools.payload-gen-variant-processing'
@@ -44,32 +44,53 @@ params.publish_dir = "" // set to empty string will disable publishDir
4444

4545

4646
// tool specific params go here, add / change as needed
params.analysis = ""           // path to the input SONG analysis JSON of the upstream (caller) analysis
params.files_to_upload = []    // VCF (+ index) files to be renamed, described and staged for upload
params.wf_name = ""            // workflow full-name key (looked up in main.py's workflow_full_name map)
params.wf_short_name = ""      // workflow short name, also default suffix for renamed files
params.wf_version = ""         // workflow version recorded in the payload
params.controlled = false      // true => files marked 'controlled' access; false => 'open'
params.process_name = ""       // optional variant-process name; falls back to wf_short_name when empty
params.analysis_type = ""      // output SONG analysisType name (main.py defaults to 'variant_processing')
4955

5056

51-
// Generates a SONG payload JSON for variant-processing outputs and stages the
// renamed VCF/TBI files under ./out/ (both emitted as separate channels).
process payloadGenVariantFiltering {
    container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
    publishDir "${params.publish_dir}/${task.process.replaceAll(':', '_')}", mode: "copy", enabled: params.publish_dir

    cpus params.cpus
    memory "${params.mem} GB"

    input:  // input, make update as needed
    path analysis            // SONG analysis JSON of the upstream analysis
    path files_to_upload     // VCF / VCF-index files to describe and rename
    val wf_name
    val wf_short_name
    val wf_version
    val controlled           // boolean: controlled vs open file access
    val process_name
    val analysis_type

    output: // output, make update as needed
    path "*.payload.json", emit: payload
    path "out/*{.vcf.gz,.vcf.gz.tbi}", emit: files_to_upload

    script:
    // optional flags are only passed through when the corresponding value is set
    arg_controlled = controlled ? "-c" : ""
    arg_process_name = process_name ? "-p ${process_name}" : ""
    arg_analysis_type = analysis_type ? "-t ${analysis_type}" : ""

    """
    main.py \
      -a ${analysis} \
      -f ${files_to_upload} \
      -w ${wf_name} \
      -s ${wf_short_name} \
      -v ${wf_version} \
      -r ${workflow.runName} \
      -j ${workflow.sessionId} \
      ${arg_controlled} ${arg_process_name} ${arg_analysis_type}

    """
}
@@ -78,7 +99,14 @@ process payloadGenVariantProcessing {
7899
// this provides an entry point for this main script, so it can be run directly without clone the repo
79100
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
80101
// Entry-point workflow: wires params straight into the process so this script
// can be run standalone (e.g. `nextflow run ... --params-file xxx`).
workflow {
    payloadGenVariantFiltering(
        file(params.analysis),
        // collect() gathers all matching files into a single list-valued emission
        Channel.fromPath(params.files_to_upload).collect(),
        params.wf_name,
        params.wf_short_name,
        params.wf_version,
        params.controlled,
        params.process_name,
        params.analysis_type
    )
}

payload-gen-variant-processing/main.py

Lines changed: 127 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,17 +3,17 @@
33

44
"""
55
Copyright (C) 2021, Ontario Institute for Cancer Research
6-
6+
77
This program is free software: you can redistribute it and/or modify
88
it under the terms of the GNU Affero General Public License as published by
99
the Free Software Foundation, either version 3 of the License, or
1010
(at your option) any later version.
11-
11+
1212
This program is distributed in the hope that it will be useful,
1313
but WITHOUT ANY WARRANTY; without even the implied warranty of
1414
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
1515
GNU Affero General Public License for more details.
16-
16+
1717
You should have received a copy of the GNU Affero General Public License
1818
along with this program. If not, see <http://www.gnu.org/licenses/>.
1919
@@ -22,32 +22,140 @@
2222
"""
2323

2424
import os
25-
import sys
2625
import argparse
27-
import subprocess
28-
26+
import json
27+
import uuid
28+
import hashlib
29+
import copy
30+
31+
# Lookup table keyed by variant type (dot-field 7 of the input file name).
# Value layout: [dataCategory, dataType,
#                analysis_tools for sanger-wgs/wxs, analysis_tools for gatk-mutect2]
variant_type_to_data_type_etc = {
    'snv': ['Simple Nucleotide Variation', 'Raw SNV Calls', ['CaVEMan', 'bcftools'], ['GATK-Mutect2', 'bcftools']],  # dataCategory, dataType, analysis_tools
    'indel': ['Simple Nucleotide Variation', 'Raw InDel Calls', ['Pindel', 'bcftools'], ['GATK-Mutect2', 'bcftools']]
}

# Maps the -w/--wf_name CLI key to the human-readable workflow name
# recorded in the payload ('workflow_name' field).
workflow_full_name = {
    'open-access-variant-filtering': 'Open Access Variant Filtering'
}
39+
40+
def calculate_size(file_path):
    """Return the size in bytes of the file at *file_path*."""
    return os.path.getsize(file_path)
42+
43+
44+
def calculate_md5(file_path):
    """Return the hex MD5 digest of the file at *file_path*.

    The file is streamed in 1 MiB chunks so arbitrarily large files
    can be hashed without loading them fully into memory.
    """
    digest = hashlib.md5()
    chunk_size = 1024 * 1024
    with open(file_path, 'rb') as fh:
        chunk = fh.read(chunk_size)
        while chunk:
            digest.update(chunk)
            chunk = fh.read(chunk_size)
    return digest.hexdigest()
50+
51+
52+
def get_files_info(file_to_upload, args):
    """Build the SONG file-metadata dict for one output file and stage a
    renamed symlink to it under ./out/.

    The original file name is parsed as dot-separated fields; the file is
    renamed by keeping fields 0-7 and appending the process name plus the
    'vcf.gz' (and optional 'tbi') extension.

    :param file_to_upload: path to a .vcf.gz or .vcf.gz.tbi file whose
        basename follows the pipeline naming scheme (>= 8 dot-fields)
    :param args: parsed CLI namespace; reads process_name, wf_short_name,
        controlled
    :return: dict with fileName/fileType/fileSize/fileMd5sum/fileAccess,
        dataType and info (data_category, optionally analysis_tools)
    """
    fname = os.path.basename(file_to_upload).split(".")
    input_wf = fname[5]       # dot-field 5: short name of the upstream caller workflow — assumed naming scheme, TODO confirm
    variant_type = fname[7]   # dot-field 7: variant type key ('snv' / 'indel')
    process_name = args.process_name if args.process_name else args.wf_short_name
    new_fname = ".".join(fname[0:8] + [process_name, 'vcf.gz'] + (['tbi'] if file_to_upload.endswith('.tbi') else []))

    file_info = {
        'fileName': new_fname,
        'fileType': 'VCF' if new_fname.endswith('.vcf.gz') else new_fname.split(".")[-1].upper(),
        'fileSize': calculate_size(file_to_upload),
        'fileMd5sum': calculate_md5(file_to_upload),
        'fileAccess': 'controlled' if args.controlled else 'open',
        'info': {
            'data_category': variant_type_to_data_type_etc[variant_type][0]
        }
    }

    if file_to_upload.endswith('.vcf.gz'):
        file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1]
    elif file_to_upload.endswith('.vcf.gz.tbi'):
        file_info['dataType'] = 'VCF Index'

    # analysis_tools depends on which upstream caller produced the calls
    if input_wf in ('sanger-wgs', 'sanger-wxs'):
        file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][2]
    elif input_wf == 'gatk-mutect2':
        file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][3]

    # Stage the renamed file under ./out/ as a symlink so the workflow can
    # emit it without copying.
    out_dir = os.path.join(os.getcwd(), 'out')
    os.makedirs(out_dir, exist_ok=True)
    dst = os.path.join(out_dir, new_fname)
    # fix: os.symlink raises FileExistsError on task re-run (or duplicate
    # target names) — drop any stale link first so the call is idempotent
    if os.path.lexists(dst):
        os.remove(dst)
    os.symlink(os.path.abspath(file_to_upload), dst)

    return file_info
92+
93+
def get_sample_info(sample_list):
    """Return a deep copy of *sample_list* with server-managed fields removed.

    Each copied sample — and its nested 'specimen' and 'donor' dicts — has the
    keys 'info', 'sampleId', 'specimenId', 'donorId' and 'studyId' stripped
    (when present), so the result is suitable for a fresh SONG submission.
    The input list is left untouched.
    """
    stripped = copy.deepcopy(sample_list)
    drop_keys = ('info', 'sampleId', 'specimenId', 'donorId', 'studyId')
    for entry in stripped:
        for key in drop_keys:
            entry.pop(key, None)
            entry['specimen'].pop(key, None)
            entry['donor'].pop(key, None)
    return stripped
29102

30103
def main():
    """
    Python implementation of tool: payload-gen-variant-processing

    Reads the upstream SONG analysis JSON, builds a new payload describing the
    (renamed) variant-processing output files, and writes it to
    <uuid>.<analysis_type>.payload.json in the current directory. As a side
    effect, get_files_info() stages renamed symlinks to the files under ./out/.
    """

    parser = argparse.ArgumentParser(description='Tool: payload-gen-variant-processing')
    parser.add_argument("-a", dest="analysis", required=True,
                        help="json file containing sequencing_alignment SONG analysis for tumour sample")
    parser.add_argument("-f", dest="files_to_upload", type=str, nargs="+", help="Files to be uploaded", required=True)
    parser.add_argument("-w", dest="wf_name", type=str, help="workflow full name", required=True)
    parser.add_argument("-s", dest="wf_short_name", type=str, help="workflow short name", required=True)
    parser.add_argument("-v", dest="wf_version", type=str, required=True, help="workflow version")
    parser.add_argument("-r", dest="wf_run", type=str, required=True, help="workflow run ID")
    parser.add_argument("-j", dest="wf_session", type=str, required=True, help="workflow session ID")
    parser.add_argument("-c", dest="controlled", action='store_true', help="set file to be controlled access")
    parser.add_argument("-p", dest="process_name", type=str, help="variant process name")
    parser.add_argument("-t", dest="analysis_type", type=str, default="variant_processing", help="specify output song analysis type")
    args = parser.parse_args()

    # Load the upstream (input) SONG analysis document.
    analysis = {}
    with open(args.analysis, 'r') as f:
        analysis = json.load(f)

    # NOTE(review): assumes 'analysisType' is always present in the input
    # analysis — .get() chaining raises AttributeError if it is missing.
    input_analysis_type = analysis.get('analysisType').get('name')

    # Assemble the output payload; sample records are stripped of
    # server-managed IDs, and provenance is recorded under 'workflow'.
    payload = {
        'analysisType': {
            'name': args.analysis_type
        },
        'studyId': analysis.get('studyId'),
        'experiment': analysis.get('experiment'),
        'samples': get_sample_info(analysis.get('samples')),
        'files': [],
        'workflow': {
            'workflow_name': workflow_full_name.get(args.wf_name),
            'workflow_short_name': args.wf_short_name,
            'workflow_version': args.wf_version,
            'run_id': args.wf_run,
            'session_id': args.wf_session,
            'inputs': [
                {
                    'input_analysis_id': analysis.get('analysisId'),
                    'analysis_type': input_analysis_type
                }
            ],
            'genome_build': 'GRCh38_hla_decoy_ebv'
        },
        'variant_class': analysis.get('variant_class')
    }

    # Describe each output file (also stages a renamed symlink under ./out/).
    for f in args.files_to_upload:
        file_info = get_files_info(f, args)
        payload['files'].append(file_info)

    # Random UUID prefix keeps payload file names unique across invocations.
    with open("%s.%s.payload.json" % (str(uuid.uuid4()), args.analysis_type), 'w') as f:
        f.write(json.dumps(payload, indent=2))
51159

52160

53161
if __name__ == "__main__":

0 commit comments

Comments
 (0)