1515from ctxcore .recovery import enrichment4cells
1616
1717LOGGER = logging .getLogger (__name__ )
18- # To reduce the memory footprint of a ranking matrix we use unsigned 32bit integers which provides a range from 0
19- # through 4,294,967,295. This should be sufficient even for region-based approaches.
18+ # To reduce the memory footprint of a ranking matrix we use unsigned 32bit integers
19+ # which provides a range from 0 through 4,294,967,295. This should be sufficient even
20+ # for region-based approaches.
2021DTYPE = "uint32"
2122DTYPE_C = c_uint32
2223
@@ -25,23 +26,28 @@ def create_rankings(ex_mtx: pd.DataFrame, seed=None) -> pd.DataFrame:
2526 """
2627 Create a whole genome rankings dataframe from a single cell expression profile dataframe.
2728
28- :param ex_mtx: The expression profile matrix. The rows should correspond to different cells, the columns to different
29- genes (n_cells x n_genes).
29+ :param ex_mtx: The expression profile matrix. The rows should correspond to
30+ different cells, the columns to different genes (n_cells x n_genes).
3031 :return: A genome rankings dataframe (n_cells x n_genes).
3132 """
3233 # Do a shuffle would be nice for exactly similar behaviour as R implementation.
3334 # 1. Ranks are assigned in the range of 1 to n, therefore we need to subtract 1.
34- # 2. In case of a tie the 'first' method is used, i.e. we keep the order in the original array. The remove any
35- # bias we shuffle the dataframe before ranking it. This introduces a performance penalty!
36- # 3. Genes are ranked according to gene expression in descending order, i.e. from highly expressed (0) to low expression (n).
37- # 3. NAs should be given the highest rank numbers. Documentation is bad, so tested implementation via code snippet:
35+ # 2. In case of a tie the 'first' method is used, i.e. we keep the order in the
36+ # original array. The remove any bias we shuffle the dataframe before ranking
37+ # it. This introduces a performance penalty!
38+ # 3. Genes are ranked according to gene expression in descending order,
39+ # i.e. from highly expressed (0) to low expression (n).
40+ # 3. NAs should be given the highest rank numbers. Documentation is bad, so tested
41+ # implementation via code snippet:
3842 #
3943 # import pandas as pd
4044 # import numpy as np
4145 # df = pd.DataFrame(data=[4, 1, 3, np.nan, 2, 3], columns=['values'])
42- # # Run below statement multiple times to see effect of shuffling in case of a tie.
43- # df.sample(frac=1.0, replace=False).rank(ascending=False, method='first', na_option='bottom').sort_index() - 1
44- #
46+ # # Run below statement multiple times to see effect of shuffling in case of a
47+ # # tie.
48+ # >>> df.sample(frac=1.0, replace=False).rank(
49+ # ... ascending=False, method='first', na_option='bottom'
50+ # ... ).sort_index() - 1
4551 return (
4652 ex_mtx .sample (frac = 1.0 , replace = False , axis = 1 , random_state = seed )
4753 .rank (axis = 1 , ascending = False , method = "first" , na_option = "bottom" )
@@ -50,15 +56,11 @@ def create_rankings(ex_mtx: pd.DataFrame, seed=None) -> pd.DataFrame:
5056 )
5157
5258
53- # In [75]: %time create_rankings_df = (exp_matrix.sample(frac=1.0, replace=False, axis=1, random_state
54- # ...: =1)[exp_matrix.columns].rank(axis=1, ascending=False, method="first", na_option="bottom").a
55- # ...: stype(DTYPE) - 1)
59+ # In [75]: %time create_rankings_df = (exp_matrix.sample(frac=1.0, replace=False, axis=1, random_state=1)[exp_matrix.columns].rank(axis=1, ascending=False, method="first", na_option="bottom").astype(DTYPE) - 1)
5660# CPU times: user 151 ms, sys: 4.02 ms, total: 155 ms
5761# Wall time: 154 ms
5862#
59- # In [76]: %time cr_pl = pl.from_pandas(exp_matrix.T, include_index=True).select(pl.col(pl.Utf8), pl.c
60- # ...: ol(pl.Float64).fill_nan(None).rank(method="random", seed=5, descending=True,).add(-1).cast(
61- # ...: pl.UInt32))
63+ # In [76]: %time cr_pl = pl.from_pandas(exp_matrix.T, include_index=True).select(pl.col(pl.Utf8), pl.col(pl.Float64).fill_nan(None).rank(method="random", seed=5, descending=True,).add(-1).cast(pl.UInt32))
6264# CPU times: user 193 ms, sys: 13.8 ms, total: 207 ms
6365# Wall time: 52.2 ms
6466
@@ -67,13 +69,14 @@ def derive_auc_threshold(ex_mtx: pd.DataFrame) -> pd.DataFrame:
6769 """
6870 Derive AUC thresholds for an expression matrix.
6971
70- It is important to check that most cells have a substantial fraction of expressed/detected genes in the calculation of
71- the AUC.
72+ It is important to check that most cells have a substantial fraction of
73+ expressed/detected genes in the calculation of the AUC.
7274
73- :param ex_mtx: The expression profile matrix. The rows should correspond to different cells, the columns to different
74- genes (n_cells x n_genes).
75- :return: A dataframe with AUC threshold for different quantiles over the number cells: a fraction of 0.01 designates
76- that when using this value as the AUC threshold for 99% of the cells all ranked genes used for AUC calculation will
75+ :param ex_mtx: The expression profile matrix. The rows should correspond to
76+ different cells, the columns to different genes (n_cells x n_genes).
77+ :return: A dataframe with AUC threshold for different quantiles over the number
78+ cells: a fraction of 0.01 designates that when using this value as the AUC
79+ threshold for 99% of the cells all ranked genes used for AUC calculation will
7780 have had a detected expression in the single-cell experiment.
7881 """
7982 return (
@@ -98,7 +101,8 @@ def _enrichment(
98101 columns = genes ,
99102 index = cells ,
100103 )
101- # To avoid additional memory burden de resulting AUCs are immediately stored in the output sync. array.
104+ # To avoid additional memory burden de resulting AUCs are immediately stored in the
105+ # output array.
102106 result_mtx = np .frombuffer (auc_mtx .get_obj (), dtype = "d" )
103107 inc = len (cells )
104108 for idx , module in enumerate (modules ):
@@ -120,9 +124,10 @@ def aucell4r(
120124
121125 :param df_rnk: The rank matrix (n_cells x n_genes).
122126 :param signatures: The gene signatures or regulons.
123- :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the
124- Area Under the recovery Curve.
125- :param noweights: Should the weights of the genes part of a signature be used in calculation of enrichment?
127+ :param auc_threshold: The fraction of the ranked genome to take into account for
128+ the calculation of the Area Under the recovery Curve.
129+ :param noweights: Should the weights of the genes part of a signature be used in
130+ calculation of enrichment?
126131 :param normalize: Normalize the AUC values to a maximum of 1.0 per regulon.
127132 :param num_workers: The number of cores to use.
128133 :return: A dataframe with the AUCs (n_cells x n_modules).
@@ -141,15 +146,18 @@ def aucell4r(
141146 ).unstack ("Regulon" )
142147 aucs .columns = aucs .columns .droplevel (0 )
143148 else :
144- # Decompose the rankings dataframe: the index and columns are shared with the child processes via pickling.
149+ # Decompose the rankings dataframe: the index and columns are shared with the
150+ # child processes via pickling.
145151 genes = df_rnk .columns .values
146152 cells = df_rnk .index .values
147- # The actual rankings are shared directly. This is possible because during a fork from a parent process the child
148- # process inherits the memory of the parent process. A RawArray is used instead of a synchronize Array because
153+ # The actual rankings are shared directly. This is possible because during a
154+ # fork from a parent process the child process inherits the memory of the
155+ # parent process. A RawArray is used instead of a synchronize Array because
149156 # these rankings are read-only.
150157 shared_ro_memory_array = RawArray (DTYPE_C , mul (* df_rnk .shape ))
151158 array = np .frombuffer (shared_ro_memory_array , dtype = DTYPE )
152- # Copy the contents of df_rank into this shared memory block using row-major ordering.
159+ # Copy the contents of df_rank into this shared memory block using row-major
160+ # ordering.
153161 array [:] = df_rnk .values .ravel (order = "C" )
154162
155163 # The resulting AUCs are returned via a synchronize array.
@@ -208,9 +216,10 @@ def aucell(
208216
209217 :param exp_mtx: The expression matrix (n_cells x n_genes).
210218 :param signatures: The gene signatures or regulons.
211- :param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the
212- Area Under the recovery Curve.
213- :param noweights: Should the weights of the genes part of a signature be used in calculation of enrichment?
219+ :param auc_threshold: The fraction of the ranked genome to take into account for
220+ the calculation of the Area Under the recovery Curve.
221+ :param noweights: Should the weights of the genes part of a signature be used in
222+ calculation of enrichment?
214223 :param normalize: Normalize the AUC values to a maximum of 1.0 per regulon.
215224 :param num_workers: The number of cores to use.
216225 :return: A dataframe with the AUCs (n_cells x n_modules).
0 commit comments