Skip to content

Commit bf4dd7c

Browse files
authored
Merge pull request #88 from icgc-argo/prep-mutect2-qc.0.1.2.0
Prep mutect2 qc.0.1.2.0
2 parents b76797f + 6a6d678 commit bf4dd7c

5 files changed

Lines changed: 127 additions & 16 deletions

File tree

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
#<METADATA>Ln prior of deletion of length 10=-20.72326583694641
2+
#<METADATA>Ln prior of deletion of length 9=-20.72326583694641
3+
#<METADATA>Ln prior of deletion of length 8=-20.72326583694641
4+
#<METADATA>Ln prior of deletion of length 7=-20.72326583694641
5+
#<METADATA>Ln prior of deletion of length 6=-20.72326583694641
6+
#<METADATA>Ln prior of deletion of length 5=-20.72326583694641
7+
#<METADATA>Ln prior of deletion of length 4=-20.72326583694641
8+
#<METADATA>Ln prior of deletion of length 3=-18.120246963748748
9+
#<METADATA>Ln prior of deletion of length 2=-20.72326583694641
10+
#<METADATA>Ln prior of deletion of length 1=-18.286701229849623
11+
#<METADATA>Ln prior of SNV=-14.821551779841153
12+
#<METADATA>Ln prior of insertion of length 1=-20.72326583694641
13+
#<METADATA>Ln prior of insertion of length 2=-18.254241200832894
14+
#<METADATA>Ln prior of insertion of length 3=-20.72326583694641
15+
#<METADATA>Ln prior of insertion of length 4=-20.72326583694641
16+
#<METADATA>Ln prior of insertion of length 5=-20.72326583694641
17+
#<METADATA>Ln prior of insertion of length 6=-20.72326583694641
18+
#<METADATA>Ln prior of insertion of length 7=-20.72326583694641
19+
#<METADATA>Ln prior of insertion of length 8=-20.72326583694641
20+
#<METADATA>Ln prior of insertion of length 9=-20.72326583694641
21+
#<METADATA>Ln prior of insertion of length 10=-20.72326583694641
22+
#<METADATA>Background beta-binomial cluster=weight = 0.2243, alpha = 1.02, beta = 4.78
23+
#<METADATA>High-AF beta-binomial cluster=weight = 0.0245, alpha = 9.99, beta = 1.04
24+
#<METADATA>Binomial cluster=weight = 0.5275, mean = 0.304
25+
#<METADATA>Binomial cluster=weight = 0.2236, mean = 0.079
26+
#<METADATA>threshold=0.577
27+
#<METADATA>fdr=0.178
28+
#<METADATA>sensitivity=0.768
29+
filter FP FDR FN FNR
30+
weak_evidence 1.77 0.05 3.7 0.1
31+
strand_bias 3.0 0.09 3.02 0.08
32+
contamination 0.02 0.0 0.0 0.0
33+
normal_artifact 0.56 0.02 0.67 0.02
34+
orientation 0.49 0.01 0.13 0.0
35+
slippage 0.0 0.0 0.0 0.0
36+
haplotype 0.45 0.01 0.12 0.0
37+
germline 0.48 0.01 0.43 0.01
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
statistic value
2+
callable 2.834460576E9

tools/prep-mutect2-qc/prep-mutect2-qc.nf

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
*/
2424

2525
nextflow.enable.dsl = 2
26-
version = '0.1.1.0'
26+
version = '0.1.2.0'
2727

2828
params.qc_files = ""
2929
params.container_version = ""

tools/prep-mutect2-qc/prep-mutect2-qc.py

Lines changed: 84 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,47 @@ def run_cmd(cmd):
3333
return
3434

3535

36+
def get_filtering_stats_extra_info(file_path):
37+
extra_info = {
38+
'filtering_stats': {}
39+
}
40+
# just to make it simpler, go through the file twice
41+
with open(file_path, 'r') as fp:
42+
for row in fp.readlines():
43+
row.strip()
44+
row = row.replace('#<METADATA>', '')
45+
if row.startswith('threshold=') or row.startswith('fdr=') or row.startswith('sensitivity='):
46+
key, value = row.split('=')
47+
extra_info[key] = float(value)
48+
49+
with open(file_path, 'r') as fp:
50+
rdr = csv.DictReader(filter(lambda row: row[0] != '#', fp), delimiter='\t')
51+
for row in rdr:
52+
filter_name = row.pop('filter')
53+
values = row
54+
for k, v in values.items():
55+
values[k] = float(v)
56+
extra_info['filtering_stats'][filter_name] = values
57+
58+
return extra_info
59+
60+
61+
def get_callable_stats_extra_info(file_path):
62+
extra_info = {}
63+
with open(file_path, 'r') as fp:
64+
for row in fp.readlines():
65+
if row.startswith('callable'):
66+
cols = row.strip().split()
67+
extra_info.update({
68+
'callable': int(float(cols[1]))
69+
})
70+
break
71+
72+
return extra_info
73+
74+
3675
def get_contamination_extra_info(file_path):
3776
extra_info = {}
38-
print(file_path)
3977
with open(file_path, 'r') as fp:
4078
for row in fp.readlines():
4179
if row.startswith('sample'):
@@ -53,12 +91,18 @@ def get_contamination_extra_info(file_path):
5391
def main(args):
5492
qc_file_patterns = {
5593
'tumour_contamination': '*.tumour.*_metrics',
56-
'normal_contamination': '*.normal.*_metrics'
94+
'normal_contamination': '*.normal.*_metrics',
95+
'callable_stats': '*.stats',
96+
'filtering_stats': '*.filtering-stats'
5797
}
5898

5999
# only contamination metrics for now, may have more later
60100
description = {
61-
'contamination': 'Cross sample contamination estimated by GATK CalculateContamination tool'
101+
'contamination': 'Cross sample contamination estimated by GATK CalculateContamination tool',
102+
'callable_stats': 'Number of sites that are considered callable for Mutect stats with read depth equals or '
103+
'is higher than callable-depth which we set to default 10',
104+
'filtering_stats': 'Information on the probability threshold chosen to optimize the F score '
105+
'and the number of false positives and false negatives from each filter to be expected from this choice.'
62106
}
63107

64108
for qc_file_pttn in qc_file_patterns:
@@ -68,21 +112,47 @@ def main(args):
68112
}
69113
metrics = {}
70114
tar_name = None
71-
cont_metrics_file = None
72-
qc_files = glob.glob(qc_file_patterns[qc_file_pttn])
73-
for f in qc_files:
74-
if f.endswith('contamination_metrics'):
75-
cont_metrics_file = f
76-
tar_name = f'{f}.tgz'
77-
extra_info['files_in_tgz'].append(f)
115+
if qc_file_pttn.endswith('_contamination'):
116+
cont_metrics_file = None
117+
qc_files = glob.glob(qc_file_patterns[qc_file_pttn])
118+
for f in qc_files:
119+
if f.endswith('contamination_metrics'):
120+
cont_metrics_file = f
121+
tar_name = f'{f}.tgz'
122+
extra_info['files_in_tgz'].append(f)
123+
124+
extra_info['description'] = description['contamination']
125+
126+
metrics = get_contamination_extra_info(cont_metrics_file)
127+
128+
elif qc_file_pttn.endswith('filtering_stats'):
129+
filtering_stats_file = None
130+
qc_files = glob.glob(qc_file_patterns[qc_file_pttn])
131+
for f in qc_files:
132+
if f.endswith('.filtering-stats'):
133+
filtering_stats_file = f
134+
tar_name = f'{f}.filtering_metrics.tgz'
135+
extra_info['files_in_tgz'].append(f)
136+
137+
extra_info['description'] = description['filtering_stats']
138+
139+
metrics = get_filtering_stats_extra_info(filtering_stats_file)
140+
141+
elif qc_file_pttn.endswith('callable_stats'):
142+
callable_stats_file = None
143+
qc_files = glob.glob(qc_file_patterns[qc_file_pttn])
144+
for f in qc_files:
145+
if f.endswith('.stats'):
146+
callable_stats_file = f
147+
tar_name = f'{f}.callable_metrics.tgz'
148+
extra_info['files_in_tgz'].append(f)
78149

79-
extra_info['description'] = description['contamination']
150+
extra_info['description'] = description['callable_stats']
80151

81-
# TODO: populate metrics info
82-
metrics = get_contamination_extra_info(cont_metrics_file)
152+
metrics = get_callable_stats_extra_info(callable_stats_file)
83153

84154
extra_info.update({"metrics": metrics})
85-
extra_json = f'{cont_metrics_file}.extra_info.json'
155+
extra_json = f'{qc_file_pttn}.extra_info.json'
86156
extra_info['files_in_tgz'].append(extra_json)
87157
with open(extra_json, 'w') as j:
88158
j.write(json.dumps(extra_info, indent=2))

tools/prep-mutect2-qc/tests/test-1.nf.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
"data/141f7e0b869a059953894fbf920e8134.tumour.contamination_metrics",
44
"data/141f7e0b869a059953894fbf920e8134.tumour.segmentation_metrics",
55
"data/82b4e5f5d6a28262cb3d6974f40d7f9a.normal.contamination_metrics",
6-
"data/82b4e5f5d6a28262cb3d6974f40d7f9a.normal.segmentation_metrics"
6+
"data/82b4e5f5d6a28262cb3d6974f40d7f9a.normal.segmentation_metrics",
7+
"data/merged-mutect-stats.stats",
8+
"data/filter-mutect-calls.filtering-stats"
79
]
810
}

0 commit comments

Comments
 (0)