Skip to content

Commit f3003d4

Browse files
committed
add tool payload-gen-variant-filtering
1 parent 5ae7101 commit f3003d4

24 files changed

Lines changed: 709 additions & 143 deletions

payload-gen-variant-filtering/main.nf

Lines changed: 24 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
along with this program. If not, see <http://www.gnu.org/licenses/>.
1818
1919
Authors:
20-
lindaxiang
20+
Linda Xiang (linda.xiang@oicr.on.ca)
2121
*/
2222

2323
/********************************************************************/
@@ -44,8 +44,11 @@ params.publish_dir = "" // set to empty string will disable publishDir
4444

4545

4646
// tool specific parmas go here, add / change as needed
47-
params.input_file = ""
48-
params.output_pattern = "*" // output file name pattern
47+
params.analysis = ""
48+
params.files_to_upload = []
49+
params.wf_name = ""
50+
params.wf_short_name = ""
51+
params.wf_version = ""
4952

5053

5154
process payloadGenVariantFiltering {
@@ -56,10 +59,14 @@ process payloadGenVariantFiltering {
5659
memory "${params.mem} GB"
5760

5861
input: // input, make update as needed
59-
path input_file
62+
path analysis
63+
path files_to_upload
64+
val wf_name
65+
val wf_short_name
66+
val wf_version
6067

6168
output: // output, make update as needed
62-
path "output_dir/${params.output_pattern}", emit: output_file
69+
path "*.payload.json", emit: payload
6370

6471
script:
6572
// add and initialize variables here as needed
@@ -68,8 +75,13 @@ process payloadGenVariantFiltering {
6875
mkdir -p output_dir
6976
7077
main.py \
71-
-i ${input_file} \
72-
-o output_dir
78+
-a ${analysis} \
79+
-f ${files_to_upload} \
80+
-w ${wf_name} \
81+
-s ${wf_short_name} \
82+
-v ${wf_version} \
83+
-r ${workflow.runName} \
84+
-j ${workflow.sessionId}
7385
7486
"""
7587
}
@@ -79,6 +91,10 @@ process payloadGenVariantFiltering {
7991
// using this command: nextflow run <git_acc>/<repo>/<pkg_name>/<main_script>.nf -r <pkg_name>.v<pkg_version> --params-file xxx
8092
workflow {
8193
payloadGenVariantFiltering(
82-
file(params.input_file)
94+
file(params.analysis),
95+
Channel.fromPath(params.files_to_upload).collect(),
96+
params.wf_name,
97+
params.wf_short_name,
98+
params.wf_version
8399
)
84100
}

payload-gen-variant-filtering/main.py

Lines changed: 102 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,36 +18,124 @@
1818
along with this program. If not, see <http://www.gnu.org/licenses/>.
1919
2020
Authors:
21-
lindaxiang
21+
Linda Xiang (linda.xiang@oicr.on.ca)
2222
"""
2323

2424
import os
25-
import sys
2625
import argparse
27-
import subprocess
26+
import json
27+
import uuid
28+
import hashlib
29+
import copy
2830

31+
variant_type_to_data_type_etc = {
32+
'snv': ['Simple Nucleotide Variation', 'Raw SNV Calls', ['CaVEMan', 'bcftools'], ['GATK-Mutect2', 'bcftools']], # dataCategory, dataType, analysis_tools
33+
'indel': ['Simple Nucleotide Variation', 'Raw InDel Calls', ['Pindel', 'bcftools'], ['GATK-Mutect2', 'bcftools']]
34+
}
35+
36+
def calculate_size(file_path):
37+
return os.stat(file_path).st_size
38+
39+
40+
def calculate_md5(file_path):
41+
md5 = hashlib.md5()
42+
with open(file_path, 'rb') as f:
43+
for chunk in iter(lambda: f.read(1024 * 1024), b''):
44+
md5.update(chunk)
45+
return md5.hexdigest()
46+
47+
48+
def get_files_info(file_to_upload):
49+
basename = os.path.basename(file_to_upload)
50+
input_wf = basename.split(".")[5]
51+
variant_type = basename.split(".")[8]
52+
file_info = {
53+
'fileName': basename,
54+
'fileType': 'VCF' if basename.endswith('.vcf.gz') else basename.split(".")[-1].upper(),
55+
'fileSize': calculate_size(file_to_upload),
56+
'fileMd5sum': calculate_md5(file_to_upload),
57+
'fileAccess': 'open',
58+
'info': {
59+
'data_category': variant_type_to_data_type_etc[variant_type][0]
60+
}
61+
}
62+
63+
if file_to_upload.endswith('.vcf.gz'):
64+
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1]
65+
elif file_to_upload.endswith('.vcf.gz.tbi'):
66+
file_info['dataType'] = 'VCF Index'
67+
else:
68+
pass
69+
70+
if input_wf in (['sanger-wgs', 'sanger-wxs']):
71+
file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][2]
72+
elif input_wf in (['gatk-mutect2']):
73+
file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][3]
74+
75+
return file_info
76+
77+
def get_sample_info(sample_list):
78+
samples = copy.deepcopy(sample_list)
79+
for sample in samples:
80+
for item in ['info', 'sampleId', 'specimenId', 'donorId', 'studyId']:
81+
sample.pop(item, None)
82+
sample['specimen'].pop(item, None)
83+
sample['donor'].pop(item, None)
84+
85+
return samples
2986

3087
def main():
3188
"""
3289
Python implementation of tool: payload-gen-variant-filtering
33-
34-
This is auto-generated Python code, please update as needed!
3590
"""
3691

3792
parser = argparse.ArgumentParser(description='Tool: payload-gen-variant-filtering')
38-
parser.add_argument('-i', '--input-file', dest='input_file', type=str,
39-
help='Input file', required=True)
40-
parser.add_argument('-o', '--output-dir', dest='output_dir', type=str,
41-
help='Output directory', required=True)
93+
parser.add_argument("-a", dest="analysis", required=True,
94+
help="json file containing sequencing_alignment SONG analysis for tumour sample")
95+
parser.add_argument("-f", dest="files_to_upload", type=str, nargs="+", help="Files to be uploaded", required=True)
96+
parser.add_argument("-w", dest="wf_name", type=str, help="workflow full name", required=True)
97+
parser.add_argument("-s", dest="wf_short_name", type=str, help="workflow short name", required=True)
98+
parser.add_argument("-v", dest="wf_version", type=str, required=True, help="workflow version")
99+
parser.add_argument("-r", dest="wf_run", type=str, required=True, help="workflow run ID")
100+
parser.add_argument("-j", dest="wf_session", type=str, required=True, help="workflow session ID")
42101
args = parser.parse_args()
43102

44-
if not os.path.isfile(args.input_file):
45-
sys.exit('Error: specified input file %s does not exist or is not accessible!' % args.input_file)
103+
analysis = {}
104+
with open(args.analysis, 'r') as f:
105+
analysis = json.load(f)
106+
107+
analysis_type = analysis.get('analysisType').get('name')
108+
payload = {
109+
'analysisType': {
110+
'name': analysis_type
111+
},
112+
'studyId': analysis.get('studyId'),
113+
'experiment': analysis.get('experiment'),
114+
'samples': get_sample_info(analysis.get('samples')),
115+
'files': [],
116+
'workflow': {
117+
'workflow_name': 'Open Access Variant Filtering',
118+
'workflow_short_name': args.wf_short_name,
119+
'workflow_version': args.wf_version,
120+
'run_id': args.wf_run,
121+
'session_id': args.wf_session,
122+
'inputs': [
123+
{
124+
'input_analysis_id': analysis.get('analysisId'),
125+
'analysis_type': analysis_type
126+
}
127+
],
128+
'genome_build': 'GRCh38_hla_decoy_ebv'
129+
},
130+
'variant_class': analysis.get('variant_class')
131+
}
46132

47-
if not os.path.isdir(args.output_dir):
48-
sys.exit('Error: specified output dir %s does not exist or is not accessible!' % args.output_dir)
133+
for f in args.files_to_upload:
134+
file_info = get_files_info(f)
135+
payload['files'].append(file_info)
49136

50-
subprocess.run(f"cp {args.input_file} {args.output_dir}/", shell=True, check=True)
137+
with open("%s.%s.payload.json" % (str(uuid.uuid4()), analysis_type), 'w') as f:
138+
f.write(json.dumps(payload, indent=2))
51139

52140

53141
if __name__ == "__main__":

payload-gen-variant-filtering/payload-gen-variant-filtering.nf

Lines changed: 0 additions & 64 deletions
This file was deleted.

payload-gen-variant-filtering/payload-gen-variant-filtering.py

Lines changed: 0 additions & 34 deletions
This file was deleted.

payload-gen-variant-filtering/tests/checker.nf

Lines changed: 24 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
along with this program. If not, see <http://www.gnu.org/licenses/>.
1818
1919
Authors:
20-
lindaxiang
20+
Linda Xiang (linda.xiang@oicr.on.ca)
2121
*/
2222

2323
/*
@@ -43,16 +43,15 @@ params.container_version = ""
4343
params.container = ""
4444

4545
// tool specific parmas go here, add / change as needed
46-
params.input_file = ""
46+
params.analysis = ""
47+
params.files_to_upload = []
48+
params.wf_name = ""
49+
params.wf_short_name = ""
50+
params.wf_version = ""
4751
params.expected_output = ""
4852

4953
include { payloadGenVariantFiltering } from '../main'
5054

51-
Channel
52-
.fromPath(params.input_file, checkIfExists: true)
53-
.set { input_file }
54-
55-
5655
process file_smart_diff {
5756
container "${params.container ?: container[params.container_registry ?: default_container_registry]}:${params.container_version ?: version}"
5857

@@ -65,37 +64,45 @@ process file_smart_diff {
6564

6665
script:
6766
"""
68-
# Note: this is only for demo purpose, please write your own 'diff' according to your own needs.
69-
# remove date field before comparison eg, <div id="header_filename">Tue 19 Jan 2021<br/>test_rg_3.bam</div>
70-
# sed -e 's#"header_filename">.*<br/>test_rg_3.bam#"header_filename"><br/>test_rg_3.bam</div>#'
71-
72-
diff <( cat ${output_file} | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
73-
<( ([[ '${expected_file}' == *.gz ]] && gunzip -c ${expected_file} || cat ${expected_file}) | sed -e 's#"header_filename">.*<br/>#"header_filename"><br/>#' ) \
67+
diff <( cat ${output_file} |sort | sed '/\"run_id\"/d' | sed '/\"session_id\"/d' ) \
68+
<( cat ${expected_file} |sort | sed '/\"run_id\"/d' | sed '/\"session_id\"/d' ) \
7469
&& ( echo "Test PASSED" && exit 0 ) || ( echo "Test FAILED, output file mismatch." && exit 1 )
7570
"""
7671
}
7772

7873

7974
workflow checker {
8075
take:
81-
input_file
76+
analysis
77+
files_to_upload
78+
wf_name
79+
wf_short_name
80+
wf_version
8281
expected_output
8382

8483
main:
8584
payloadGenVariantFiltering(
86-
input_file
85+
analysis,
86+
files_to_upload,
87+
wf_name,
88+
wf_short_name,
89+
wf_version
8790
)
8891

8992
file_smart_diff(
90-
payloadGenVariantFiltering.out.output_file,
93+
payloadGenVariantFiltering.out.payload,
9194
expected_output
9295
)
9396
}
9497

9598

9699
workflow {
97100
checker(
98-
file(params.input_file),
101+
file(params.analysis),
102+
Channel.fromPath(params.files_to_upload).collect(),
103+
params.wf_name,
104+
params.wf_short_name,
105+
params.wf_version,
99106
file(params.expected_output)
100107
)
101108
}

0 commit comments

Comments
 (0)