Skip to content

Commit e9ff98c

Browse files
committed
Updated data category, type, subtype, and tools to the latest terminology
1 parent e777468 commit e9ff98c

8 files changed

Lines changed: 75 additions & 61 deletions

payload-gen-variant-calling/main.py

Lines changed: 28 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -33,21 +33,21 @@
3333

3434

3535
variant_type_to_data_type_etc = {
36-
'snv': ['Simple Nucleotide Variation', 'Raw SNV Calls', ['CaVEMan'], ['GATK-Mutect2']], # dataCategory, dataType, analysis_tools
37-
'indel': ['Simple Nucleotide Variation', 'Raw InDel Calls', ['Pindel'], ['GATK-Mutect2']],
38-
'cnv': ['Copy Number Variation', 'Raw CNV Calls', ['ASCAT']],
39-
'sv': ['Structural Variation', 'Raw SV Calls', ['BRASS']],
40-
'caveman-supplement': ['Simple Nucleotide Variation', 'Variant Calling Supplement', ['CaVEMan']],
41-
'pindel-supplement': ['Simple Nucleotide Variation', 'Variant Calling Supplement', ['Pindel']],
42-
'ascat-supplement': ['Copy Number Variation', 'Variant Calling Supplement', ['ASCAT']],
43-
'brass-supplement': ['Structural Variation', 'Variant Calling Supplement', ['BRASS']],
44-
'timings-supplement': [None, 'Variant Calling Supplement', ['CaVEMan', 'Pindel', 'ASCAT', 'BRASS']],
45-
'bas_metrics': ['Quality Control Metrics', 'Alignment QC', ['bas_stats']],
46-
'contamination_metrics': ['Quality Control Metrics', 'Cross Sample Contamination', ['verifyBamHomChk'], ['GATK-CalculateContamination']],
47-
'ascat_metrics': ['Quality Control Metrics', 'Ploidy and Purity Estimation', ['ASCAT']],
48-
'genotyped_gender_metrics': ['Quality Control Metrics', 'Genotyping Inferred Gender', ['compareBamGenotypes']],
49-
'mutect_filtering_metrics': ['Quality Control Metrics', 'Mutect2 Filtering Stats', [], ['GATK-FilterMutectCalls']],
50-
'mutect_callable_metrics': ['Quality Control Metrics', 'Mutect2 Callable Stats', [], ['GATK-Mutect2']],
36+
'snv': ['Simple Nucleotide Variation', ['Raw SNV Calls', None], ['CaVEMan'], ['GATK:Mutect2']], # dataCategory, [dataType, data_subtype], analysis_tools
37+
'indel': ['Simple Nucleotide Variation', ['Raw InDel Calls', None], ['Pindel'], ['GATK:Mutect2']],
38+
'cnv': ['Copy Number Variation', ['Raw CNV Calls', None], ['ASCAT']],
39+
'sv': ['Structural Variation', ['Raw SV Calls', None], ['BRASS']],
40+
'caveman-supplement': ['Simple Nucleotide Variation', ['Variant Calling Supplement', 'SNV Supplement'], ['CaVEMan']],
41+
'pindel-supplement': ['Simple Nucleotide Variation', ['Variant Calling Supplement', 'InDel Supplement'], ['Pindel']],
42+
'ascat-supplement': ['Copy Number Variation', ['Variant Calling Supplement', 'CNV Supplement'], ['ASCAT']],
43+
'brass-supplement': ['Structural Variation', ['Variant Calling Supplement', 'SV Supplement'], ['BRASS']],
44+
'timings-supplement': ['Quality Control Metrics', ['Analysis QC', 'Runtime Stats'], None, None],
45+
'bas_metrics': ['Quality Control Metrics', ['Aligned Reads QC', 'Alignment Metrics'], ['Sanger:bam_stats']],
46+
'contamination_metrics': ['Quality Control Metrics', ['Analysis QC', 'Cross Sample Contamination'], ['Sanger:verifyBamHomChk'], ['GATK:CalculateContamination']],
47+
'ascat_metrics': ['Quality Control Metrics', ['Analysis QC', 'Ploidy and Purity Estimation'], ['ASCAT']],
48+
'genotyped_gender_metrics': ['Quality Control Metrics', ['Analysis QC', 'Genotyping Stats'], ['Sanger:compareBamGenotypes']],
49+
'mutect_filtering_metrics': ['Quality Control Metrics', ['Analysis QC', 'Variant Filtering Stats'], [], ['GATK:FilterMutectCalls']],
50+
'mutect_callable_metrics': ['Quality Control Metrics', ['Analysis QC', 'Variant Callable Stats'], [], ['GATK:Mutect2']],
5151
}
5252

5353
workflow_full_name = {
@@ -56,6 +56,7 @@
5656
'gatk-mutect2-variant-calling': 'GATK Mutect2 Variant Calling'
5757
}
5858

59+
5960
def calculate_size(file_path):
6061
return os.stat(file_path).st_size
6162

@@ -107,12 +108,6 @@ def get_files_info(file_to_upload, wf_short_name, wf_version, somatic_or_germli
107108
else:
108109
sys.exit('Error: unknown file type "%s"' % file_to_upload)
109110

110-
if wf_short_name == 'sanger-wxs':
111-
if 'ASCAT' in variant_type_to_data_type_etc['timings-supplement'][2]:
112-
variant_type_to_data_type_etc['timings-supplement'][2].remove('ASCAT')
113-
if 'BRASS' in variant_type_to_data_type_etc['timings-supplement'][2]:
114-
variant_type_to_data_type_etc['timings-supplement'][2].remove('BRASS')
115-
116111
elif wf_short_name in (['gatk-mutect2']):
117112
fname_sample_part = metadata['samples'][0]['sampleId']
118113
if file_to_upload.endswith('mutect2-snv.vcf.gz') or file_to_upload.endswith('mutect2-snv.vcf.gz.tbi'):
@@ -153,16 +148,25 @@ def get_files_info(file_to_upload, wf_short_name, wf_version, somatic_or_germli
153148
] + (['tbi'] if file_to_upload.endswith('.tbi') else []))
154149

155150
file_info['fileName'] = new_fname
151+
152+
file_info['info'] = {
153+
'data_category': variant_type_to_data_type_etc[variant_type][0],
154+
'data_subtype': None
155+
}
156+
156157
extra_info = {}
157158
if new_fname.endswith('.vcf.gz'):
158-
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1]
159+
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1][0]
160+
file_info['info']['data_subtype'] = variant_type_to_data_type_etc[variant_type][1][1]
159161
elif new_fname.endswith('.vcf.gz.tbi'):
160162
file_info['dataType'] = 'VCF Index'
161163
elif new_fname.endswith('.tgz'):
162164
if new_fname.endswith('-supplement.tgz'):
163-
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1]
165+
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1][0]
166+
file_info['info']['data_subtype'] = variant_type_to_data_type_etc[variant_type][1][1]
164167
elif new_fname.endswith('_metrics.tgz'):
165-
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1]
168+
file_info['dataType'] = variant_type_to_data_type_etc[variant_type][1][0]
169+
file_info['info']['data_subtype'] = variant_type_to_data_type_etc[variant_type][1][1]
166170
else:
167171
sys.exit('Error: unknown file type "%s"' % file_to_upload)
168172

@@ -175,10 +179,6 @@ def get_files_info(file_to_upload, wf_short_name, wf_version, somatic_or_germli
175179
else:
176180
sys.exit('Error: unknown file type "%s"' % file_to_upload)
177181

178-
file_info['info'] = {
179-
'data_category': variant_type_to_data_type_etc[variant_type][0]
180-
}
181-
182182
if wf_short_name in (['sanger-wgs', 'sanger-wxs']):
183183
file_info['info']['analysis_tools'] = variant_type_to_data_type_etc[variant_type][2]
184184
elif wf_short_name in (['gatk-mutect2']):

tests/data/1a7df8db-cb54-43e1-952f-3c458a14800a.variant_calling.payload.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
"fileAccess": "controlled",
3333
"info": {
3434
"data_category": "Simple Nucleotide Variation",
35+
"data_subtype": null,
3536
"analysis_tools": [
3637
"CaVEMan"
3738
]
@@ -46,6 +47,7 @@
4647
"fileAccess": "controlled",
4748
"info": {
4849
"data_category": "Simple Nucleotide Variation",
50+
"data_subtype": null,
4951
"analysis_tools": [
5052
"CaVEMan"
5153
]

tests/data/66f84b1e-dad1-4981-916e-07e62ff53410.variant_calling_supplement.payload.json

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,9 @@
3131
"fileMd5sum": "36de1a4788a5ca24eddaaf0bac7c42c2",
3232
"fileAccess": "controlled",
3333
"info": {
34-
"data_category": null,
35-
"analysis_tools": [
36-
"CaVEMan",
37-
"Pindel"
38-
],
34+
"data_category": "Quality Control Metrics",
35+
"data_subtype": "Runtime Stats",
36+
"analysis_tools": null,
3937
"description": "Files contain timing information for different processing steps",
4038
"files": [
4139
"WGS_SA610228.time.verify_WT",
@@ -56,7 +54,7 @@
5654
]
5755
},
5856
"fileName": "TEST-PR.DO250122.SA610148.wgs.20210414.sanger-wxs.somatic.timings-supplement.tgz",
59-
"dataType": "Variant Calling Supplement"
57+
"dataType": "Analysis QC"
6058
},
6159
{
6260
"fileType": "TGZ",
@@ -65,6 +63,7 @@
6563
"fileAccess": "controlled",
6664
"info": {
6765
"data_category": "Simple Nucleotide Variation",
66+
"data_subtype": "SNV Supplement",
6867
"analysis_tools": [
6968
"CaVEMan"
7069
],
@@ -96,6 +95,7 @@
9695
"fileAccess": "controlled",
9796
"info": {
9897
"data_category": "Simple Nucleotide Variation",
98+
"data_subtype": "InDel Supplement",
9999
"analysis_tools": [
100100
"Pindel"
101101
],

tests/data/735a4c34-c928-4f03-957b-fe808df68f63.qc_metrics.payload.json

Lines changed: 12 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@
3232
"fileAccess": "controlled",
3333
"info": {
3434
"data_category": "Quality Control Metrics",
35+
"data_subtype": "Cross Sample Contamination",
3536
"analysis_tools": [
36-
"GATK-CalculateContamination"
37+
"GATK:CalculateContamination"
3738
],
3839
"description": "Cross sample contamination estimated by GATK CalculateContamination tool",
3940
"files_in_tgz": [
@@ -48,7 +49,7 @@
4849
}
4950
},
5051
"fileName": "TEST-PR.DO250122.SA610149.wgs.20210414.gatk-mutect2.somatic.contamination_metrics.tgz",
51-
"dataType": "Cross Sample Contamination"
52+
"dataType": "Analysis QC"
5253
},
5354
{
5455
"fileType": "TGZ",
@@ -57,8 +58,9 @@
5758
"fileAccess": "controlled",
5859
"info": {
5960
"data_category": "Quality Control Metrics",
61+
"data_subtype": "Cross Sample Contamination",
6062
"analysis_tools": [
61-
"GATK-CalculateContamination"
63+
"GATK:CalculateContamination"
6264
],
6365
"description": "Cross sample contamination estimated by GATK CalculateContamination tool",
6466
"files_in_tgz": [
@@ -73,7 +75,7 @@
7375
}
7476
},
7577
"fileName": "TEST-PR.DO250122.SA610148.wgs.20210414.gatk-mutect2.somatic.contamination_metrics.tgz",
76-
"dataType": "Cross Sample Contamination"
78+
"dataType": "Analysis QC"
7779
},
7880
{
7981
"fileType": "TGZ",
@@ -82,8 +84,9 @@
8284
"fileAccess": "controlled",
8385
"info": {
8486
"data_category": "Quality Control Metrics",
87+
"data_subtype": "Variant Filtering Stats",
8588
"analysis_tools": [
86-
"GATK-FilterMutectCalls"
89+
"GATK:FilterMutectCalls"
8790
],
8891
"description": "Information on the probability threshold chosen to optimize the F score and the number of false positives and false negatives from each filter to be expected from this choice.",
8992
"files_in_tgz": [
@@ -147,7 +150,7 @@
147150
}
148151
},
149152
"fileName": "TEST-PR.DO250122.SA610148.wgs.20210414.gatk-mutect2.somatic.mutect_filtering_metrics.tgz",
150-
"dataType": "Mutect2 Filtering Stats"
153+
"dataType": "Analysis QC"
151154
},
152155
{
153156
"fileType": "TGZ",
@@ -156,8 +159,9 @@
156159
"fileAccess": "controlled",
157160
"info": {
158161
"data_category": "Quality Control Metrics",
162+
"data_subtype": "Variant Callable Stats",
159163
"analysis_tools": [
160-
"GATK-Mutect2"
164+
"GATK:Mutect2"
161165
],
162166
"description": "Number of sites that are considered callable for Mutect stats with read depth equals or is higher than callable-depth which we set to default 10",
163167
"files_in_tgz": [
@@ -169,7 +173,7 @@
169173
}
170174
},
171175
"fileName": "TEST-PR.DO250122.SA610148.wgs.20210414.gatk-mutect2.somatic.mutect_callable_metrics.tgz",
172-
"dataType": "Mutect2 Callable Stats"
176+
"dataType": "Analysis QC"
173177
}
174178
],
175179
"workflow": {
Binary file not shown.
Binary file not shown.

tests/data/d354c4b2-db32-41f0-973f-db1306a99a84.qc_metrics.payload.json

Lines changed: 23 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@
3232
"fileAccess": "controlled",
3333
"info": {
3434
"data_category": "Quality Control Metrics",
35+
"data_subtype": "Cross Sample Contamination",
3536
"analysis_tools": [
36-
"verifyBamHomChk"
37+
"Sanger:verifyBamHomChk"
3738
],
3839
"description": "Cross sample contamination estimated by Sanger verifyBamHomChk.pl script",
3940
"files": [
@@ -54,7 +55,7 @@
5455
}
5556
},
5657
"fileName": "TEST-PR.DO250183.SA610228.wgs.20210414.sanger-wgs.somatic.contamination_metrics.tgz",
57-
"dataType": "Cross Sample Contamination"
58+
"dataType": "Analysis QC"
5859
},
5960
{
6061
"fileType": "TGZ",
@@ -63,8 +64,9 @@
6364
"fileAccess": "controlled",
6465
"info": {
6566
"data_category": "Quality Control Metrics",
67+
"data_subtype": "Cross Sample Contamination",
6668
"analysis_tools": [
67-
"verifyBamHomChk"
69+
"Sanger:verifyBamHomChk"
6870
],
6971
"description": "Cross sample contamination estimated by Sanger verifyBamHomChk.pl script",
7072
"files": [
@@ -85,43 +87,45 @@
8587
}
8688
},
8789
"fileName": "TEST-PR.DO250183.SA610229.wgs.20210414.sanger-wgs.somatic.contamination_metrics.tgz",
88-
"dataType": "Cross Sample Contamination"
90+
"dataType": "Analysis QC"
8991
},
9092
{
9193
"fileType": "TGZ",
92-
"fileSize": 10240,
93-
"fileMd5sum": "1a28c470a0692b2d662cdbb155a3e799",
94+
"fileSize": 283,
95+
"fileMd5sum": "75bc69d0e25d1702c00113783a496e4d",
9496
"fileAccess": "controlled",
9597
"info": {
9698
"data_category": "Quality Control Metrics",
99+
"data_subtype": "Alignment Metrics",
97100
"analysis_tools": [
98-
"bas_stats"
101+
"Sanger:bam_stats"
99102
],
100-
"description": "Alignment QC metrics generated by Sanger bas_stats.pl script",
103+
"description": "Alignment QC metrics generated by Sanger bam_stats tool",
101104
"files": [
102105
"TEST-PR.DO250183.SA610228.wgs.20200320.aln.cram.bas"
103106
]
104107
},
105108
"fileName": "TEST-PR.DO250183.SA610228.wgs.20210414.sanger-wgs.somatic.bas_metrics.tgz",
106-
"dataType": "Alignment QC"
109+
"dataType": "Aligned Reads QC"
107110
},
108111
{
109112
"fileType": "TGZ",
110-
"fileSize": 10240,
111-
"fileMd5sum": "d4e7c2608b3d876d4e818b065b34ee77",
113+
"fileSize": 283,
114+
"fileMd5sum": "4760983c802ceecb4315958ca83c7b57",
112115
"fileAccess": "controlled",
113116
"info": {
114117
"data_category": "Quality Control Metrics",
118+
"data_subtype": "Alignment Metrics",
115119
"analysis_tools": [
116-
"bas_stats"
120+
"Sanger:bam_stats"
117121
],
118-
"description": "Alignment QC metrics generated by Sanger bas_stats.pl script",
122+
"description": "Alignment QC metrics generated by Sanger bam_stats tool",
119123
"files": [
120124
"TEST-PR.DO250183.SA610229.wgs.20200320.aln.cram.bas"
121125
]
122126
},
123127
"fileName": "TEST-PR.DO250183.SA610229.wgs.20210414.sanger-wgs.somatic.bas_metrics.tgz",
124-
"dataType": "Alignment QC"
128+
"dataType": "Aligned Reads QC"
125129
},
126130
{
127131
"fileType": "TGZ",
@@ -130,6 +134,7 @@
130134
"fileAccess": "controlled",
131135
"info": {
132136
"data_category": "Quality Control Metrics",
137+
"data_subtype": "Ploidy and Purity Estimation",
133138
"analysis_tools": [
134139
"ASCAT"
135140
],
@@ -148,7 +153,7 @@
148153
}
149154
},
150155
"fileName": "TEST-PR.DO250183.SA610229.wgs.20210414.sanger-wgs.somatic.ascat_metrics.tgz",
151-
"dataType": "Ploidy and Purity Estimation"
156+
"dataType": "Analysis QC"
152157
},
153158
{
154159
"fileType": "TGZ",
@@ -157,8 +162,9 @@
157162
"fileAccess": "controlled",
158163
"info": {
159164
"data_category": "Quality Control Metrics",
165+
"data_subtype": "Genotyping Stats",
160166
"analysis_tools": [
161-
"compareBamGenotypes"
167+
"Sanger:compareBamGenotypes"
162168
],
163169
"description": "Compare genotypes of CRAM files from the same donor and produces the fraction of matched genotypes. It also checks if the inferred genders are matched.",
164170
"files": [
@@ -189,7 +195,7 @@
189195
}
190196
},
191197
"fileName": "TEST-PR.DO250183.SA610229.wgs.20210414.sanger-wgs.somatic.genotyped_gender_metrics.tgz",
192-
"dataType": "Genotyping Inferred Gender"
198+
"dataType": "Analysis QC"
193199
}
194200
],
195201
"workflow": {

tests/data/f7ca0bd4-53a7-4663-b7b6-1b3d414e0018.variant_calling.payload.json

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,9 @@
3232
"fileAccess": "controlled",
3333
"info": {
3434
"data_category": "Simple Nucleotide Variation",
35+
"data_subtype": null,
3536
"analysis_tools": [
36-
"GATK-Mutect2"
37+
"GATK:Mutect2"
3738
]
3839
},
3940
"fileName": "TEST-PR.DO250122.SA610148.wgs.20210414.gatk-mutect2.somatic.snv.vcf.gz",
@@ -46,8 +47,9 @@
4647
"fileAccess": "controlled",
4748
"info": {
4849
"data_category": "Simple Nucleotide Variation",
50+
"data_subtype": null,
4951
"analysis_tools": [
50-
"GATK-Mutect2"
52+
"GATK:Mutect2"
5153
]
5254
},
5355
"fileName": "TEST-PR.DO250122.SA610148.wgs.20210414.gatk-mutect2.somatic.snv.vcf.gz.tbi",

0 commit comments

Comments
 (0)