@@ -33,9 +33,47 @@ def run_cmd(cmd):
3333 return
3434
3535
36+ def get_filtering_stats_extra_info (file_path ):
37+ extra_info = {
38+ 'filtering_stats' : {}
39+ }
40+ # just to make it simpler, go through the file twice
41+ with open (file_path , 'r' ) as fp :
42+ for row in fp .readlines ():
43+ row .strip ()
44+ row = row .replace ('#<METADATA>' , '' )
45+ if row .startswith ('threshold=' ) or row .startswith ('fdr=' ) or row .startswith ('sensitivity=' ):
46+ key , value = row .split ('=' )
47+ extra_info [key ] = float (value )
48+
49+ with open (file_path , 'r' ) as fp :
50+ rdr = csv .DictReader (filter (lambda row : row [0 ] != '#' , fp ), delimiter = '\t ' )
51+ for row in rdr :
52+ filter_name = row .pop ('filter' )
53+ values = row
54+ for k , v in values .items ():
55+ values [k ] = float (v )
56+ extra_info ['filtering_stats' ][filter_name ] = values
57+
58+ return extra_info
59+
60+
61+ def get_callable_stats_extra_info (file_path ):
62+ extra_info = {}
63+ with open (file_path , 'r' ) as fp :
64+ for row in fp .readlines ():
65+ if row .startswith ('callable' ):
66+ cols = row .strip ().split ()
67+ extra_info .update ({
68+ 'callable' : int (float (cols [1 ]))
69+ })
70+ break
71+
72+ return extra_info
73+
74+
3675def get_contamination_extra_info (file_path ):
3776 extra_info = {}
38- print (file_path )
3977 with open (file_path , 'r' ) as fp :
4078 for row in fp .readlines ():
4179 if row .startswith ('sample' ):
@@ -53,12 +91,18 @@ def get_contamination_extra_info(file_path):
5391def main (args ):
5492 qc_file_patterns = {
5593 'tumour_contamination' : '*.tumour.*_metrics' ,
56- 'normal_contamination' : '*.normal.*_metrics'
94+ 'normal_contamination' : '*.normal.*_metrics' ,
95+ 'callable_stats' : '*.stats' ,
96+ 'filtering_stats' : '*.filtering-stats'
5797 }
5898
5999 # only contamination metrics for now, may have more later
60100 description = {
61- 'contamination' : 'Cross sample contamination estimated by GATK CalculateContamination tool'
101+ 'contamination' : 'Cross sample contamination estimated by GATK CalculateContamination tool' ,
102+ 'callable_stats' : 'Number of sites that are considered callable for Mutect stats with read depth equals or '
103+ 'is higher than callable-depth which we set to default 10' ,
104+ 'filtering_stats' : 'Information on the probability threshold chosen to optimize the F score '
105+ 'and the number of false positives and false negatives from each filter to be expected from this choice.'
62106 }
63107
64108 for qc_file_pttn in qc_file_patterns :
@@ -68,21 +112,47 @@ def main(args):
68112 }
69113 metrics = {}
70114 tar_name = None
71- cont_metrics_file = None
72- qc_files = glob .glob (qc_file_patterns [qc_file_pttn ])
73- for f in qc_files :
74- if f .endswith ('contamination_metrics' ):
75- cont_metrics_file = f
76- tar_name = f'{ f } .tgz'
77- extra_info ['files_in_tgz' ].append (f )
115+ if qc_file_pttn .endswith ('_contamination' ):
116+ cont_metrics_file = None
117+ qc_files = glob .glob (qc_file_patterns [qc_file_pttn ])
118+ for f in qc_files :
119+ if f .endswith ('contamination_metrics' ):
120+ cont_metrics_file = f
121+ tar_name = f'{ f } .tgz'
122+ extra_info ['files_in_tgz' ].append (f )
123+
124+ extra_info ['description' ] = description ['contamination' ]
125+
126+ metrics = get_contamination_extra_info (cont_metrics_file )
127+
128+ elif qc_file_pttn .endswith ('filtering_stats' ):
129+ filtering_stats_file = None
130+ qc_files = glob .glob (qc_file_patterns [qc_file_pttn ])
131+ for f in qc_files :
132+ if f .endswith ('.filtering-stats' ):
133+ filtering_stats_file = f
134+ tar_name = f'{ f } .filtering_metrics.tgz'
135+ extra_info ['files_in_tgz' ].append (f )
136+
137+ extra_info ['description' ] = description ['filtering_stats' ]
138+
139+ metrics = get_filtering_stats_extra_info (filtering_stats_file )
140+
141+ elif qc_file_pttn .endswith ('callable_stats' ):
142+ callable_stats_file = None
143+ qc_files = glob .glob (qc_file_patterns [qc_file_pttn ])
144+ for f in qc_files :
145+ if f .endswith ('.stats' ):
146+ callable_stats_file = f
147+ tar_name = f'{ f } .callable_metrics.tgz'
148+ extra_info ['files_in_tgz' ].append (f )
78149
79- extra_info ['description' ] = description ['contamination ' ]
150+ extra_info ['description' ] = description ['callable_stats ' ]
80151
81- # TODO: populate metrics info
82- metrics = get_contamination_extra_info (cont_metrics_file )
152+ metrics = get_callable_stats_extra_info (callable_stats_file )
83153
84154 extra_info .update ({"metrics" : metrics })
85- extra_json = f'{ cont_metrics_file } .extra_info.json'
155+ extra_json = f'{ qc_file_pttn } .extra_info.json'
86156 extra_info ['files_in_tgz' ].append (extra_json )
87157 with open (extra_json , 'w' ) as j :
88158 j .write (json .dumps (extra_info , indent = 2 ))
0 commit comments