-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathstat_lib.py
More file actions
1682 lines (1340 loc) · 72.5 KB
/
stat_lib.py
File metadata and controls
1682 lines (1340 loc) · 72.5 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
"""
stat_lib
Library containing core classes and functions used in
ramediesDN, ramediesCH and ramedies_CH_IND.
##############
Dependencies:
1. numpy
2. os
3. scipy
4. init_objs_lib
5. cfg
##############
Classes:
Variant class: class with instances containing information about particular variants
Gene class: class containing score-mutational target distributions as well as
the variant information for each gene stratified by annotation.
CH_variant class: class storing information about CH variants in the form of pairs
of Variant class instances.
VariantCollection class: instances store variant (Variant class) information
stratified by individuals. Used in ramediesCH and ramedies_CH_IND algorithms
##############
Functions are subdivided into blocks. See the rest of the document for details.
"""
import numpy as np
import init_objs_lib
import cfg
from os import path, remove
from scipy.stats import norm, binom, poisson
# ==========================================================================================
# Universal Class Definitions
# ==========================================================================================
class Variant:
"""
Variant contains relevant information about a particular variant.
Attributes:
mut_id_tuple (str, str, str, str): chromosome, 1-indexed position, reference allele, alternate allele.
var_annot (char, char): variant type 'C' for coding or 'I' for intronic; 'S' for SNV or 'I' for indel.
score (float): deleteriousness score of variant.
inher (str): variant inheritance; M = from mom, D = from dad, DN = neither/de novo.
proband_id (str): name of VCF file serving as a proband identifier.
"""
def __init__(self, mut_id_tuple, var_annot, score, inher, proband_id=None):
"""
Initializes Variant with provided variant attributes.
Args:
mut_id_tuple (str, str, str, str): chromosome, 1-indexed position, reference allele, alternate allele.
var_annot (char, char): variant type 'C' for coding or 'I' for intronic; 'S' for SNV or 'I' for indel.
score (float): deleteriousness score of variant.
inher (str): variant inheritance; M = from mom, D = from dad, DN = neither/de novo.
proband_id (str): name of VCF file serving as a proband identifier.
"""
self.mut_id_tuple = mut_id_tuple
self.var_annot = var_annot
self.score = score
self.inher = inher
self.proband_id = proband_id
def make_mut_id(self):
"""
:return: (str) corresponding to the variant ID, formatted as chromosome:ref-allele_position_alt-allele
"""
chrom = self.mut_id_tuple[0]
ref = self.mut_id_tuple[2]
pos = self.mut_id_tuple[1]
alt = self.mut_id_tuple[3]
mut_id = f"chr{chrom}:{ref}_{pos}_{alt}"
return mut_id
def print_info(self):
"""
:return: (str) containing all information contained in the Variant class
"""
mut_id = self.make_mut_id()
va = f"{self.var_annot[0]}{self.var_annot[1]}" # variant annotation e.g., CI for coding indel
return f"{mut_id}|{va}|{self.score}|{self.inher}|{self.proband_id}"
class Gene:
"""
Gene contains per-gene mutational target distributions by annotation class (coding/intronic and SNV/indel)
Gene also stores observed variants as a list of Variant instances
Attributes:
score_arr [float]: array of sorted (ascending) values of unique deleteriousness scores present in gene
mut_targ_arr [float]: array of corresponding mutational targets for each unique deleteriousness score
computed as the total mutational target of all scores equal to or higher than that score
gene_mu (float): the maximum mutational target of this gene (i.e., all variants included)
gene_length (int): total number of unique deleteriousness scores in this gene, unrelated to actual gene length
vars [Variant]: list of variants observed in this gene in a proband
mu_list [float]: list of mutational targets of all variants of a specific type
"""
def __init__(self, score_arr, mut_targ_arr):
"""
Args:
score_arr [float]: read from precomputed data/score_lists*gz files
mut_targ_arr [float]: read from precomputed data/score_lists*gz files
"""
self.score_arr, self.mut_targ_arr = zip(*sorted(zip(score_arr, mut_targ_arr)))
self.mut_targ_arr = np.array(self.mut_targ_arr)
self.gene_mu = self.mut_targ_arr[0]
self.gene_length = len(self.mut_targ_arr)
self.vars = [] # updated by the parse_VCF_lib/parse_variant_input function
self.mu_list = []
def normalize_mut_targs(self, mut_targ_norm=1.0):
"""
Normalize mutational targets so that the mutational target of the entire genome will be 1.
Normalizations are generated by parse_VCF_lib/parse_variant_scores_files function as total_mu_dict dictionary
Args:
mut_targ_norm (float): normalize mutational targets to sum to this value (default: 1.0)
"""
self.mut_targ_arr = self.mut_targ_arr / mut_targ_norm
self.gene_mu = self.mut_targ_arr[0]
def get_mu_from_score(self, score):
"""
:return: the mutational target in this gene corresponding to a given score (search binary search)
Args:
score (int): deleteriousness score to return a corresponding mutational target for
"""
if self.score_arr[0] >= score:
return self.mut_targ_arr[0]
elif self.score_arr[-1] < score:
return 0.0
ind_low = 0
ind_high = self.gene_length - 1
ind_mid = (ind_high - ind_low) // 2
while ind_high - ind_low > 1:
if self.score_arr[ind_mid] == score:
return self.mut_targ_arr[ind_mid]
elif self.score_arr[ind_low] == score:
return self.mut_targ_arr[ind_low]
elif self.score_arr[ind_high] == score:
return self.mut_targ_arr[ind_high]
elif self.score_arr[ind_mid] < score:
ind_low = ind_mid
ind_mid = (ind_high + ind_low) // 2
elif self.score_arr[ind_mid] > score:
ind_high = ind_mid
ind_mid = (ind_high + ind_low) // 2
return self.mut_targ_arr[ind_low]
def calculate_muttargs_denovo(self, sum_values=False):
"""
:return: the mutational target for the y statistic for a single annotation (used only in the de novo regime)
Args:
sum_values (bool): return the sum of mutational targets rather than the list (default: False)
"""
self.mu_list = [] # list of mutational targets which is printed into a log file
for var in self.vars:
mu = self.get_mu_from_score(var.score)
self.mu_list.append((self.gene_mu - mu) / self.gene_mu)
if sum_values:
return np.sum(self.mu_list)
return self.mu_list
# ==========================================================================================
# Compound Heterozygous Class Definitions (used by RaMeDiES-CH and RaMeDiES-IND)
# ==========================================================================================
class CH_variant:
"""
CH_variant contains relevant information for a compound heterozygous variant pair
Attributes:
var_P (Variant): paternally inherited Variant
var_M (Variant): maternally inherited variant
ensembl_gene_id (str): Ensembl gene ID where the variant is found
mu (float): maximum mutational target of maternally/paternally inherited variants
mu2 (float): normalized squared mutational target corresponding to the compound heterozygous variant pair
"""
def __init__(self, Variant_obj_tuple, ensembl_gene_id):
"""
Args:
Variant_obj_tuple ((Variant, Variant)): pair of paternally and maternally inherited variants in a gene
ensembl_gene_id: Ensembl gene ID where the variants were found in trans
"""
self.var_P = Variant_obj_tuple[0]
self.var_M = Variant_obj_tuple[1]
self.ENS_ID = ensembl_gene_id
self.mu = None
self.mu2 = None
def muttarg(self, Gene_obj_dict, norm=None):
"""
:return: mutational target of the compound heterozygous variant
Args:
Gene_obj_dict (dict): variant annotation (e.g., CI) -> Ensembl gene ID -> Gene (object)
norm: normalization factor to scale squared mutational targets
"""
# Gene_obj_dict: variant annotation -> ENSEMBL ID -> Gene object
Gene_obj_P = Gene_obj_dict[self.var_P.var_annot][self.ENS_ID]
Gene_obj_M = Gene_obj_dict[self.var_M.var_annot][self.ENS_ID]
mu_P = Gene_obj_P.get_mu_from_score(self.var_P.score)
mu_M = Gene_obj_M.get_mu_from_score(self.var_M.score)
if not norm:
norm = np.max([Gene_obj_P.gene_mu ** 2, Gene_obj_M.gene_mu ** 2])
self.mu = np.max([mu_P, mu_M]) # unadjusted mutational target of compound heterozygous variant
self.mu2 = self.mu ** 2 / norm # compound heterozygous mutational target with respect to the gene
return self.mu2
def print_info(self):
"""
:return: (str) corresponding to paternal variant & maternal variant information
"""
info_P = self.var_P.print_info()
info_M = self.var_M.print_info()
return f"{info_P}&{info_M}"
class VariantCollection:
"""
VariantCollection stores information about singular and compound heterozygous variants
Attributes:
P_vars ({Variant}): set of paternally inherited variants
M_vars ({Variant}): set of maternally inherited Variants
CH_list ([CH_variant]): list of all unique compound heterozygous variant pair objects in this gene
ensembl_gene_id (str): Ensembl gene ID
y (str): y statistic corresponding to the most surprising compound heterozygous variant mutational target
in this gene (scaled with respect to the genome)
"""
def __init__(self, ensembl_gene_id, Gene_obj_dict):
"""
Sort the variants present in Gene_obj_dict by inheritance
Args:
ensembl_gene_id (str): Ensembl gene ID
Gene_obj_dict (dict): variant annotation (e.g., CI) -> Ensembl gene ID -> Gene (object)
"""
self.P_vars = {}
self.M_vars = {}
for va in init_objs_lib.variant_type_iter():
if not Gene_obj_dict[va].get(ensembl_gene_id):
continue
Gene_obj = Gene_obj_dict[va][ensembl_gene_id]
for Variant_obj in Gene_obj.vars:
if Variant_obj.inher == 'P':
if not self.P_vars.get(Variant_obj.proband_id):
self.P_vars[Variant_obj.proband_id] = []
self.P_vars[Variant_obj.proband_id].append(Variant_obj)
elif Variant_obj.inher == 'M':
if not self.M_vars.get(Variant_obj.proband_id):
self.M_vars[Variant_obj.proband_id] = []
self.M_vars[Variant_obj.proband_id].append(Variant_obj)
self.CH_list = []
self.ENS_ID = ensembl_gene_id
self.y = None
def make_comphet_var_list(self, Gene_obj_dict):
"""
:return: list of compound heterozygous variant with the smallest mutational target per gene in this proband
Args:
Gene_obj_dict (dict): variant annotation (e.g., CI) -> Ensembl gene ID -> Gene (object)
"""
if self.P_vars == {} or self.M_vars == {}:
return None
# for each proband, find all compound heterozygous variant pairs (trans within a gene)
per_proband_CHs = {}
for proband_id, Variant_obj_list_P in self.P_vars.items():
Variant_obj_list_M = self.M_vars.get(proband_id)
if not Variant_obj_list_M:
continue
per_proband_CHs[proband_id] = {}
for P_var in Variant_obj_list_P:
for M_var in Variant_obj_list_M:
CH = CH_variant((P_var, M_var), self.ENS_ID)
mu2_CH = CH.muttarg(Gene_obj_dict)
per_proband_CHs[proband_id][mu2_CH] = CH
for proband_id, CH_dict in per_proband_CHs.items():
mu2_top = sorted(list(CH_dict.keys()))[0]
self.CH_list.append(CH_dict[mu2_top])
def calc_comphet_y(self):
"""
:return: the cohort-wide y statistic
"""
y = 0
for CH_obj in self.CH_list:
y += 1 - CH_obj.mu2
self.y = y
return y
# ==========================================================================================
# Write and load summary statistics files containing denovo VARIANT COUNT information
# ==========================================================================================
def write_by_annot_varcount(by_annot_varcount_dict, outfile_mask, args_obj):
"""
:param by_annot_varcount_dict: inheritance type (e.g., M for maternal) -> variant type (e.g., CI for coding indels)
-> variant count (number of variants of this type)
:param outfile_mask: outfile file prefix, specified by user using the --o parameter
:param args_obj: dictionary of parsed command-line arguments
:return: None, write variant counts by variant type to outfile
"""
outfile_name = f"{outfile_mask}_{cfg.varcount_sums_DN}.txt"
with open(outfile_name, 'w') as outh:
outh.write('# De novo mutation count by variant type across cohort\n')
init_objs_lib.write_run_info(outh, args_obj)
outh.write("inheritance\tvariant_type\tdenovo_mutation_count\n")
for inher, var_count_dict in by_annot_varcount_dict.items():
for var_annot, var_count in by_annot_varcount_dict[inher].items():
va_str = f"{var_annot[0]}{var_annot[1]}"
outh.write(f"{inher}\t{va_str}\t{var_count}\n")
print(f"By-annotation variant counts written to: {outfile_name}")
def make_by_annot_varcount_dict(varcount_dict, outfile_mask, args_obj):
"""
:param varcount_dict: individual ID (filename) -> inheritance type (e.g., M for maternal) ->
variant type (e.g., CI for coding indels) -> # variants of this type
:param outfile_mask: outfile file prefix, specified by user using the --o parameter
:param args_obj: dictionary of parsed command-line arguments
:return: the "by_annot_varcount_dict" used as an argument to above function "write_by_annot_varcount"
"""
by_annot_varcount_dict = {}
for ind_id, ind_varcount in varcount_dict.items():
for inher, inher_dict in ind_varcount.items():
if not by_annot_varcount_dict.get(inher):
by_annot_varcount_dict[inher] = init_objs_lib.init_varcount_dict()
for va in cfg.var_annot_list:
by_annot_varcount_dict[inher][va] += ind_varcount[inher][va]
write_by_annot_varcount(by_annot_varcount_dict, outfile_mask, args_obj)
return by_annot_varcount_dict
def load_varcount_from_file(by_annot_varcount_dict,
input_ID,
variant_annots,
suppress_indels_bool):
"""
:param by_annot_varcount_dict: inheritance type (e.g., M for maternal) -> variant type (e.g., CI for coding indels)
-> variant count (number of variants of this type)
:param input_ID: summary statistics input file ID (initially produced by the "write_by_annot_varcount" function),
specified by user using the --M parameter
:param variant_annots: variant annotation types included, specified by user using the --variant_annots parameter
:param suppress_indels_bool: boolean indicating whether to include indels (False), specified by user
:return: updated "by_annot_varcount_dict" used as an argument to above function "write_by_annot_varcount"
"""
infile_name = f"{input_ID}_{cfg.varcount_sums_DN}.txt"
if not path.isfile(infile_name):
raise AssertionError(f"Variant count file {infile_name} does not exist")
with open(infile_name, 'r') as inh:
header = None
for vc_str in inh:
if vc_str.startswith('#'):
continue
if not header:
header = vc_str.strip().split()
continue
vc_str = vc_str.strip().split()
inher = vc_str[0] # 1st column: inheritance code (cfg.inherited_from_dict)
var_annot = (vc_str[1][0], vc_str[1][1]) # 2nd column: variant annotation (cfg.var_annot_list)
if not var_annot[0] in variant_annots:
continue
if suppress_indels_bool and var_annot[1] == 'I':
continue
varcount = eval(vc_str[2]) # 3rd column: variant count
if not by_annot_varcount_dict.get(inher): # by_annot_varcount_dict update
by_annot_varcount_dict[inher] = init_objs_lib.init_varcount_dict()
by_annot_varcount_dict[inher][var_annot] += varcount
print(f"Variant counts from {infile_name} loaded")
return by_annot_varcount_dict
def load_by_annot_varcount_dict(input_IDs, variant_annots, suppress_indels_bool):
"""
:param input_IDs: comma-separated list of summary statistics input file IDs (initially produced by the
"write_by_annot_varcount" function), specified by user using the --M parameter
:param variant_annots: variant annotation types included, specified by user using the --variant_annots parameter
:param suppress_indels_bool: boolean indicating whether to include indels (False), specified by user
:return: updated "by_annot_varcount_dict" used as an argument to above function "write_by_annot_varcount"
"""
input_IDs = input_IDs.split(',')
by_annot_varcount_dict = {}
for input_ID in input_IDs:
by_annot_varcount_dict = load_varcount_from_file(by_annot_varcount_dict,
input_ID,
variant_annots,
suppress_indels_bool)
return by_annot_varcount_dict
def write_varcount_dist(varcount_dict,
outfile_mask,
input_directory='<unspecified>',
args_obj=None):
"""
:param varcount_dict: individual ID (filename) -> inheritance type (e.g., M for maternal) ->
variant type (e.g., CI for coding indels) -> # variants of this type
:param outfile_mask: outfile file prefix, specified by user using the --o parameter
:param input_directory: directory containing all tab-delimited input variant files per individual
:param args_obj: dictionary of parsed command-line arguments
:return: None, write variant count distribution to file. NOTE: this is not used in any analysis, but may
be useful for quality control analyses of variant calling across the cohort
"""
varcount_dist = {}
suffix = 'comphet'
for file_name, inher_dict in varcount_dict.items():
for inher, va_dict in inher_dict.items():
if 'DN' in inher:
suffix = 'denovo'
for va, var_num in va_dict.items():
if not varcount_dist.get(inher):
varcount_dist[inher] = {va: {} for va in cfg.var_annot_list}
if not varcount_dist[inher][va].get(var_num):
varcount_dist[inher][va][var_num] = 0
varcount_dist[inher][va][var_num] += 1
outfile_name = f"{outfile_mask}_{suffix}_{cfg.varcount_mask}.txt"
# Printing the variant count distribution to the output
with open(f"{outfile_name}", 'w') as outh:
outh.write('# Variant count distribution computed from input variant files in '+input_directory+'\n')
init_objs_lib.write_run_info(outh, args_obj)
outh.write("inheritance\tvariant_type\tvariant_count\tnumber_samples\n")
for inher, va_dict in varcount_dist.items():
for va, dist_dict in va_dict.items():
va_str = f"{va[0]}{va[1]}"
for var_num in sorted(dist_dict.keys()):
freq = dist_dict[var_num]
outh.write(f"{inher}\t{va_str}\t{var_num}\t{freq}\n")
print(f"Variant count distributions written to: {outfile_name}")
# ==========================================================================================
# Write and load summary statistics files containing denovo MUTATIONAL TARGET information
# ==========================================================================================
def load_muttargs_from_file(Gene_inst_dict,
input_file_id,
variant_annots,
suppress_indels_bool):
"""
Iteratively called by load_muttargs_from_filelist function below
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:param input_file_id: summary statistics input file ID (initially produced by the "write_muttargs" function),
specified by user using the --M parameter
:param variant_annots: variant annotation types included, specified by user using the --variant_annots parameter
:param suppress_indels_bool: boolean indicating whether to include indels (False), specified by user
:return: updated (previously initialized) "Gene_inst_dict" used as an argument to below function "write_muttargs"
"""
infile_name = f"{input_file_id}_{cfg.muttargs_list_DN_ID}.txt"
if not path.isfile(infile_name):
raise AssertionError(f"mutational target file {infile_name} does not exist")
with open(infile_name, 'r') as inh:
accepted_vars = 0
discarded_vars = 0
header = None
for mu_str in inh:
if mu_str.startswith('#'):
continue
if not header:
header = mu_str.strip().split()
continue
mu_str = mu_str.strip().split()
variant_annot_types, ensembl_gene_id, mutational_targets = mu_str[0], mu_str[1], mu_str[2]
mutational_targets = [eval(i) for i in mutational_targets.split(',')]
variant_annot_types = (variant_annot_types[0], variant_annot_types[1])
# Filter by specified annotations
if not variant_annot_types[0] in variant_annots:
continue
if suppress_indels_bool and variant_annot_types[1] == 'I':
continue
for mu in mutational_targets:
# Filter by genes accepted in gene_instances
if not Gene_inst_dict[variant_annot_types].get(ensembl_gene_id):
discarded_vars += 1
continue
# gene_instances update
Gene_inst_dict[variant_annot_types][ensembl_gene_id].mu_list.append(mu)
accepted_vars += 1
print(f"Accepted {accepted_vars} variant targets, {discarded_vars} discarded")
return Gene_inst_dict
def load_muttargs_from_filelist(Gene_inst_dict,
input_file_list,
variant_annots,
suppress_indels_bool):
"""
Called if --metadata_run_mode is enabled by cohort recurrence methods RaMeDiES-DN or RaMeDiES-CH
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:param input_file_list: comma-separated list of summary statistics input file IDs (initially produced by the
"write_muttargs" function), specified by user using the --M parameter
:param variant_annots: variant annotation types included, specified by user using the --variant_annots parameter
:param suppress_indels_bool: boolean indicating whether to include indels (False), specified by user
:return: updated (previously initialized) "Gene_inst_dict" used as an argument to below function "write_muttargs"
"""
input_file_list = input_file_list.split(',')
for input_file_id in input_file_list:
Gene_inst_dict = load_muttargs_from_file(Gene_inst_dict,
input_file_id,
variant_annots,
suppress_indels_bool)
return Gene_inst_dict
def write_muttargs(Gene_inst_dict,
outfile_mask,
input_directory="<unspecified>",
args_obj=None):
"""
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:param outfile_mask: outfile file prefix, specified by user using the --o parameter
:param input_directory: directory containing all tab-delimited input variant files per individual
:param args_obj: dictionary of parsed command-line arguments
:return: None, write mutational targets by variant type to outfile
"""
outfile_name = f"{outfile_mask}_{cfg.muttargs_list_DN_ID}.txt"
with open(outfile_name, 'w') as outh:
outh.write('# Mutational targets computed from input variant files located in: '+input_directory+'\n')
init_objs_lib.write_run_info(outh, args_obj)
outh.write(f"variant_type\tensembl_gene_id\tper_patient_mutational_targets\n")
for va, va_Gene_obj_dict in Gene_inst_dict.items():
for ENS_ID, Gene_obj in va_Gene_obj_dict.items():
muttarg_list = Gene_obj.calculate_muttargs_denovo()
if len(muttarg_list) == 0:
continue
if args_obj and args_obj.write_muttarg_sums:
muttarg_str = np.sum(muttarg_list)
else:
muttarg_str = ','.join([str(i) for i in muttarg_list])
outh.write(f"{va[0]}{va[1]}\t{ENS_ID}\t{muttarg_str}\n")
print(f"Variant mutational targets written to: {outfile_name}")
# ==========================================================================================
# Statistics functions used by RaMeDiES-DN (de novo recurrence)
# ==========================================================================================
def count_genes(Gene_inst_dict, return_dict=False):
"""
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:param return_dict: boolean whether to return the complete dictionary (True) or the number of genes (False)
:return: the number of genes under consideration (or a dictionary of gene_id -> True of the genes under consideration
consideration for the variant types in question). Used as the Bonferroni correction factor for
cohort-level recurrence functions RaMeDiES-DN and RaMeDiES-CH.
"""
ensg_gene_id_dict = {}
for va in init_objs_lib.variant_type_iter():
for ensg_gene_id in Gene_inst_dict[va].keys():
ensg_gene_id_dict[ensg_gene_id] = True
if return_dict:
return ensg_gene_id_dict
return len(ensg_gene_id_dict.keys())
def y_from_mu(gene_mus):
"""
Called by count_y function
:param gene_mus: dictionary of Ensembl gene ID -> array of mutational targets (across a cohort)
:return: sum of observed mutational targets per gene across a cohort
"""
gene_ys = {} # Ensembl gene ID -> sum of observed mutational targets across all individuals
for ensg_gene_id, muttarg_arr in gene_mus.items():
if muttarg_arr == []:
continue
if not gene_ys.get(ensg_gene_id):
gene_ys[ensg_gene_id] = 0
gene_ys[ensg_gene_id] += np.sum(muttarg_arr)
return gene_ys
def count_y(Gene_inst_dict):
"""
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:return: sum of observed mutational targets per gene across a cohort
"""
gene_mus = {} # gene_mus: ENSEMBL_ID -> array of mutational tergets
for va, va_Gene_obj_dict in Gene_inst_dict.items():
for ensg_gene_id, Gene_obj in va_Gene_obj_dict.items():
if not gene_mus.get(ensg_gene_id):
gene_mus[ensg_gene_id] = []
gene_mus[ensg_gene_id] += Gene_obj.mu_list
return y_from_mu(gene_mus) # gene_ys: ENSEMBL ID -> y statistic
def make_gene_mutdict(Gene_inst_dict):
"""
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:return: dictionary of Ensembl gene ID -> list of observed Variant objects (from input tab-delimited variant files)
"""
gene_mutdict = {}
for va, va_Gene_obj_dict in Gene_inst_dict.items():
for ensg_gene_id, Gene_obj in va_Gene_obj_dict.items():
for Variant_obj in Gene_obj.vars:
if not gene_mutdict.get(ensg_gene_id):
gene_mutdict[ensg_gene_id] = []
gene_mutdict[ensg_gene_id].append(Variant_obj)
return gene_mutdict
def calc_denovo_lambda(varcount_dict, Gene_inst_dict, ensg_gene_id):
"""
:param varcount_dict: dictionary of inheritance type (e.g., M for maternal) -> variant annotation type
(e.g., CI for coding indels) -> total observed variants of this type
:param Gene_inst_dict: variant annotation type (e.g., CI) -> Ensembl Gene ID -> Gene (object)
:param ensg_gene_id: Ensembl gene ID
:return: the expected variant count for this gene (i.e., lambda parameter)
"""
lambda_parameter = 0
for va in cfg.var_annot_list:
if Gene_inst_dict[va].get(ensg_gene_id):
# For each variant annotation type, multiply the variant count by the mutational target.
lambda_parameter += varcount_dict["DN"][va] * Gene_inst_dict[va][ensg_gene_id].gene_mu
return lambda_parameter
def false_diag_rate(lambda_parameter, num_samples):
"""
Computes the binomial upper bound specified by cfg.false_diag_rate
To obtain an estimate of the fraction of incorrect diagnoses (i.e., number of patients with a variant in a
recurrently-hit gene, where that variant in that gene in that patient is NOT a true diagnosis), divide the
false diagnosis rate (computed here as the binomial upper bound specified by cfg.false_diag_rate) by the number of
variants occurring in a given gene across the cohort.
:param lambda_parameter: the expected variant count for this gene (i.e., lambda parameter)
:param num_samples: size of the cohort
:return: the false diagnosis rate based on the expected mutational target of all variants and the cohort size
"""
if num_samples == -1:
return np.nan
p = lambda_parameter / num_samples # estimate of the binomial probability
# true diagnosis rate (i.e., fraction of individuals where this gene is the correct diagnosis)
higher_p = 1 - cfg.false_diag_rate
return binom.ppf(higher_p, num_samples, p)
def irwinhall_cdf(x, n):
"""
:param x: statistic
:param n: Irwin-Hall parameter (number of elements in a sum of 0-1 Uniforms)
:return: value of the cumulative density function (CDF) of the Irwin-Hall distribution parameterized by n and
computed at x
"""
# Setting CDF = 1 to the right of the realm of definition
if x >= n:
return 1
# Setting CDF = 0 to the left of the realm of definition
elif x <= 0:
return 0
# Central Limit theorem gives a good approximation on about n > 8
if n > cfg.IH_norm_approx_thr:
return norm.cdf(x, loc=n / 2, scale=np.sqrt(n / 12))
k = int(x)
stat = 0
for i in range(k + 1):
xmi = x - i
if xmi == 0:
continue
if i == 0:
log_factorial_i = 0
else:
log_factorial_i = np.sum(np.log([j for j in range(1, i + 1)]))
if n - i == 0:
log_factorial_nmi = 0
else:
log_factorial_nmi = np.sum(np.log([j for j in range(1, (n - i) + 1)]))
# Using factorial transform here to make the calculations more robust
log_prod_xmi = n * np.log(x - i)
val = ((-1) ** i) * np.exp(-log_factorial_i - log_factorial_nmi + log_prod_xmi)
stat += val
return stat
def denovo_prob(lambda_parameter):
"""
:param lambda_parameter: expected variant count, see "calc_denovo_lambda"
:return: probability of observing a variant at all given the expected variant count in a cohort
"""
return 1.0 - np.exp(-lambda_parameter)
def process_single_gene(y, lambda_parameter, gene_constraint_weight, num_samples):
"""
For more information on the weighted FDR procedure, see Genovese et al., 2006 for details
(https://www.jstor.org/stable/20441304)
:param y: sum of mutational targets of independent variants within a single gene
:param lambda_parameter: (float) expected variant count in this gene
:param gene_constraint_weight: gene constraint based weight with mean of 1 used in the weighted FDR procedure
:param num_samples: number of samples in the cohort
:return: the Irwin-Hall/Poisson recurrence statistic computed for a single gene
"""
P_dnv = denovo_prob(lambda_parameter)
P_val = 0
# The "infinite" sum is computed with the first ~1000 values
for i in range(1, cfg.maxIHval):
# Irwin-Hall survival function
P_IH = 1 - irwinhall_cdf(y, i)
# Very small values may be negative due to computational errors
if P_IH < 0:
P_IH = 0
P_Pois = poisson.pmf(i, lambda_parameter)
# Element of the sum
add_P = P_IH * P_Pois
# Early stop in case of the added value being small with respect to the already calculated value
if P_val != 0 and add_P / P_val < cfg.pval_precision:
break
P_val += add_P
# Probability of an observed mutational pattern given that a single mutation has been observed. Used in Q-Q plots
P_cond = P_val / P_dnv
# Q value of the weighted FDR procedure
if gene_constraint_weight != 0:
Q_val = P_val / gene_constraint_weight
else:
Q_val = 1.0
# Computing the false diagnosis rate
f_diag_rate = false_diag_rate(lambda_parameter, num_samples)
return [Q_val, P_val, P_cond, P_dnv, lambda_parameter, f_diag_rate, gene_constraint_weight]
def generate_output_line(pvalues_array,
out_handle,
ensembl_gene_id,
ensembl_to_genename,
gene_mutdict,
first_line=False):
"""
:param pvalues_array: output of "process_single_gene", array of [Q_val, P_val, P_cond, P_dnv, lambda_parameter,
false_diagnosis_rate, gene_constraint_weight]
:param out_handle: output file object (opened for write)
:param ensembl_gene_id: Ensembl gene ID
:param ensembl_to_genename: dictionary of ensembl gene ID -> HGNC gene name
(see init_objs_lib/make_ENS2GeneID_dict for details) for details)
:param gene_mutdict: output of "make_gene_mutdict", array of Variant objects corresponding to variants present in
tab-delimited input variant files
:param first_line: boolean indicating whether to write a header (True)
:return: None, but write properly formatted lines to open file handle as specified
"""
if first_line:
out_handle.write('# Genes harboring de novo mutations across cohort ranked by Q-value\n')
out_handle.write('# file_names should include relevant patient identifiers\n')
out_handle.write('# Variant inheritance is M=maternally-inherited, P=paternally-inherited, and DN=denovo\n')
out_handle.write('# variant_info is &-delimited values: chromsome:refallele_position_altallele|variant_type|' +
'variant_functionality_score|variant_inheritance|file_name\n')
print_arr = ["file_names",
"ensembl_gene_id",
"gene_name",
"Q_val",
"P_val",
"P_cond",
"P_dnv",
"poisson_lambda",
"false_diag_rate",
"gene_weight",
"variant_info"]
out_handle.write('\t'.join(print_arr) + '\n')
return None
# DEFAULT run: variant and individual information is included in output
if gene_mutdict:
# List of individuals carrying detected mutations in the gene
ind_list = [Variant_obj.proband_id for Variant_obj in gene_mutdict[ensembl_gene_id]]
ind_list = ','.join(ind_list)
# List of variant information
var_info_list = [Variant_obj.print_info() for Variant_obj in gene_mutdict[ensembl_gene_id]]
var_info_list = ','.join(var_info_list)
# METADATA run: variant- and individual-level data is NOT printed
else:
ind_list = '.'
var_info_list = '.'
gene_name = ensembl_to_genename.get(ensembl_gene_id)
if not gene_name:
gene_name = '.'
print_arr = [ind_list,
ensembl_gene_id,
gene_name,
str(pvalues_array[0]),
str(pvalues_array[1]),
str(pvalues_array[2]),
str(pvalues_array[3]),
str(pvalues_array[4]),
str(pvalues_array[5]),
str(pvalues_array[6]),
var_info_list]
out_handle.write('\t'.join(print_arr) + '\n')
def weighted_fdr_correction(qval_array, fdrs_to_report, num_genes):
"""
Perform Benjamini-Hochberg algorithm on Q-values for weighted FDR procedure
For details, see Genovese et al., 2006 (https://www.jstor.org/stable/20441304)
:param qval_array: array of unique Q-values
:param fdrs_to_report: FDR limits at which to report output, specified by user, default 5% and 10%
:param num_genes: number of genes processed
:return: the Q-value threshold corresponding to each FDR threshold specified in "fdrs_to_report"
"""
qval_array = sorted(qval_array)
qval_array_length = len(qval_array)
fdr_cutoff_to_qval_threshold = {fdr: 0 for fdr in fdrs_to_report}
# For each FDR cutoff to report, compute the corresponding Q-value threshold
for i in range(qval_array_length):
for fdr in fdrs_to_report:
qval_threshold = fdr * (i + 1) / num_genes
if qval_array[i] < qval_threshold:
fdr_cutoff_to_qval_threshold[fdr] = qval_threshold
return fdr_cutoff_to_qval_threshold
def FDR_procedure(outfile_mask, fdrs_to_report, num_genes):
"""
:param outfile_mask: outfile file prefix, specified by user using the --o parameter
:param fdrs_to_report: FDR limits at which to report output, specified by user, default 5% and 10%
:param num_genes: number of genes processed
:return: None, but write out intermediate file used to perform the weighted FDR procedure on P-values
"""
outfile_dict = {} # qvalue -> output string; used to produce output sorted by qvalue
qval_array = []
filename = f"{outfile_mask}_{cfg.DN_result}_tmp.txt"
with open(filename, 'r') as inh:
header = None
for output_str in inh:
if output_str.startswith('#'):
continue
if not header:
header = {k: i for i, k in enumerate(output_str.strip().split())}
header_line = output_str.strip().split()
continue
output_str = output_str.strip().split('\t')
qval = eval(output_str[header["Q_val"]])
qval_array.append(qval)
if not outfile_dict.get(qval):
outfile_dict[qval] = []
outfile_dict[qval].append(output_str)
fdr_to_qval_threshold = weighted_fdr_correction(qval_array, fdrs_to_report, num_genes)
fdr_column_names = '\t'.join([f"FDR_{p}" for p in fdr_to_qval_threshold])
# Final output file
filename = f"{outfile_mask}_{cfg.DN_result}.txt"
with open(filename, 'w') as outh:
# write out original header plus additional FDR-relevant columns
hs = '\t'.join(header_line)
outh.write(f"{hs}\t{fdr_column_names}\n")
for qval in sorted(outfile_dict.keys()):
fdr_results_array = []
# Using Q-value thresholds from weighted_fdr_correction
for FDR in fdr_to_qval_threshold:
# FDR pass/ not pass is added to the output
if qval < fdr_to_qval_threshold[FDR]:
fdr_results_array.append("True")
else:
fdr_results_array.append("False")
concatenated_fdr_results = '\t'.join(fdr_results_array)
for file_str_arr in outfile_dict[qval]:
fsa = '\t'.join(file_str_arr)
outh.write(f"{fsa}\t{concatenated_fdr_results}\n")
print(f"Results written to {filename}")
def calc_denovo_stat(ensg_to_y_stat,
varcount_dict,
Gene_inst_dict,
outfile_mask,
num_samples,
gene_mutdict,
fdrs_to_report_str,
gene_constraint_score_name,
num_genes):
"""
Master function for RaMeDiES-DN; based on input variant information OR supplied metadata, calculates
a de novo recurrence p-value per gene, performs a weighted FDR procedure using gene constraint weights,
and writes results to file.
:param ensg_to_y_stat: dictionary of Ensembl gene ID -> (y statistic, count_y output)