1+ import pandas
2+ from functions import list_filter , compute_continuous , compute_categorical
3+
4+ #%%
5+
def compute(data,
            group,
            continuous=[],
            categorical=[],
            skewed=[],
            weights=None,
            decimals=2,
            intervals=None):
    """
    Compute standardized differences (SDs) for all specified variables.

    Parameters:
        data (dataframe): Pandas DataFrame containing observations (rows) and variables (columns)
        group (str): Variable defining the exposed and unexposed groups
        continuous (list): Names of the continuous variables for which the SD should be computed
        categorical (list): Names of the categorical variables for which the SD should be computed
        skewed (list): Names of the continuous variables which have a skewed distribution
            (ranked SD computed). A name listed here only has an effect if it is
            also listed in ``continuous``.
        weights (None or str): Variable defining weights for each observation
            (otherwise assumed to be equally weighted)
        decimals (int): Number of decimal places which should be computed
        intervals (None or float): Whether CIs should be computed and with what coverage,
            e.g. for 95% CI, intervals = 0.95

    Returns:
        A Pandas DataFrame with one row per computed variable, in the order the
        variables appear in ``data``. The columns are ``'ES'`` and, when
        ``intervals`` is given, a second ``'<coverage>% CI'`` column. If none of
        the requested variables could be computed, an empty DataFrame with
        those columns is returned.

    Raises:
        AssertionError: If an argument has the wrong type or ``intervals`` lies
            outside the open range (0, 1).
    """

    # Asserting input types.
    # The list defaults in the signature are never mutated below, so sharing
    # them between calls is harmless.
    # NOTE(review): these are ``assert`` statements and are skipped under
    # ``python -O``; kept so existing callers still see AssertionError.
    assert isinstance(data, pandas.DataFrame), "Data must be specified as a Pandas DataFrame"
    assert isinstance(group, str), "Group variable must be specified as a string"
    assert (isinstance(continuous, list)
            and isinstance(categorical, list)
            and isinstance(skewed, list)), "Variable names must be specified inside lists"
    assert weights is None or isinstance(weights, str), "If weight variable is present, it must be specified as a string"
    # bool is a subclass of int; reject it explicitly, as the original exact-type check did.
    assert isinstance(decimals, int) and not isinstance(decimals, bool), "Number of decimal places must be specified as an integer"
    assert intervals is None or 0 < intervals < 1, "CIs must be specified as None or in range (0,1) e.g. for 95% CI, intervals = 0.95"

    # Get combined list of variables and sort them into the order in which
    # they appear in the dataframe.
    specified_variables = continuous + categorical  # concatenation already yields a new list
    all_variables = list(data)

    for variable in specified_variables:
        assert isinstance(variable, str), "The variable names inside lists must all be specified as strings"
        if variable not in all_variables:
            print("The following variable was not computed as it could not be found in dataframe columns:", variable)

    # NOTE(review): list_filter is presumed to return the entries of list1 that
    # also occur in list2, keeping list1's order -- confirm in ``functions``.
    ordered_variables = list_filter(list1=all_variables, list2=specified_variables)

    # Computing the standardized difference, one row per variable.
    results = []

    for variable in ordered_variables:

        if variable in continuous:
            stdiff = compute_continuous(data=data,
                                        group=group,
                                        variable=variable,
                                        skewed=variable in skewed,
                                        weights=weights,
                                        decimals=decimals,
                                        intervals=intervals)
        else:
            stdiff = compute_categorical(data=data,
                                         group=group,
                                         variable=variable,
                                         weights=weights,
                                         decimals=decimals,
                                         intervals=intervals)

        results.append(stdiff)

    # Column labels: the estimate alone, or the estimate plus its CI.
    if intervals is None:
        column_labels = ['ES']
    else:
        ci_label = round((intervals * 100), ndigits=2)
        column_labels = ['ES', str(ci_label) + '% CI']

    # Nothing was computed (no variables requested, or none found in the
    # dataframe): return an empty table instead of failing on the relabel below.
    if not results:
        return pandas.DataFrame(columns=column_labels)

    table = pandas.DataFrame(data=results)

    # Assign the labels directly rather than via set_axis(..., inplace=True):
    # the ``inplace`` argument was removed in pandas 2.0.
    # NOTE(review): the nested list produces a one-level MultiIndex; kept as-is
    # so the returned frame has the same shape as before.
    table.index = [ordered_variables]
    table.columns = column_labels

    return table
0 commit comments