Skip to content

Commit 563e8ea

Browse files
committed
Fix docstring and comment lengths in ""src/ctxcore/aucell.py".
Fix docstring and comment lengths in ""src/ctxcore/aucell.py".
1 parent 422164a commit 563e8ea

1 file changed

Lines changed: 43 additions & 34 deletions

File tree

src/ctxcore/aucell.py

Lines changed: 43 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,9 @@
1515
from ctxcore.recovery import enrichment4cells
1616

1717
LOGGER = logging.getLogger(__name__)
18-
# To reduce the memory footprint of a ranking matrix we use unsigned 32bit integers which provides a range from 0
19-
# through 4,294,967,295. This should be sufficient even for region-based approaches.
18+
# To reduce the memory footprint of a ranking matrix we use unsigned 32bit integers
19+
# which provides a range from 0 through 4,294,967,295. This should be sufficient even
20+
# for region-based approaches.
2021
DTYPE = "uint32"
2122
DTYPE_C = c_uint32
2223

@@ -25,23 +26,28 @@ def create_rankings(ex_mtx: pd.DataFrame, seed=None) -> pd.DataFrame:
2526
"""
2627
Create a whole genome rankings dataframe from a single cell expression profile dataframe.
2728
28-
:param ex_mtx: The expression profile matrix. The rows should correspond to different cells, the columns to different
29-
genes (n_cells x n_genes).
29+
:param ex_mtx: The expression profile matrix. The rows should correspond to
30+
different cells, the columns to different genes (n_cells x n_genes).
3031
:return: A genome rankings dataframe (n_cells x n_genes).
3132
"""
3233
# Do a shuffle would be nice for exactly similar behaviour as R implementation.
3334
# 1. Ranks are assigned in the range of 1 to n, therefore we need to subtract 1.
34-
# 2. In case of a tie the 'first' method is used, i.e. we keep the order in the original array. The remove any
35-
# bias we shuffle the dataframe before ranking it. This introduces a performance penalty!
36-
# 3. Genes are ranked according to gene expression in descending order, i.e. from highly expressed (0) to low expression (n).
37-
# 3. NAs should be given the highest rank numbers. Documentation is bad, so tested implementation via code snippet:
35+
# 2. In case of a tie the 'first' method is used, i.e. we keep the order in the
36+
# original array. The remove any bias we shuffle the dataframe before ranking
37+
# it. This introduces a performance penalty!
38+
# 3. Genes are ranked according to gene expression in descending order,
39+
# i.e. from highly expressed (0) to low expression (n).
40+
# 3. NAs should be given the highest rank numbers. Documentation is bad, so tested
41+
# implementation via code snippet:
3842
#
3943
# import pandas as pd
4044
# import numpy as np
4145
# df = pd.DataFrame(data=[4, 1, 3, np.nan, 2, 3], columns=['values'])
42-
# # Run below statement multiple times to see effect of shuffling in case of a tie.
43-
# df.sample(frac=1.0, replace=False).rank(ascending=False, method='first', na_option='bottom').sort_index() - 1
44-
#
46+
# # Run below statement multiple times to see effect of shuffling in case of a
47+
# # tie.
48+
# >>> df.sample(frac=1.0, replace=False).rank(
49+
# ... ascending=False, method='first', na_option='bottom'
50+
# ... ).sort_index() - 1
4551
return (
4652
ex_mtx.sample(frac=1.0, replace=False, axis=1, random_state=seed)
4753
.rank(axis=1, ascending=False, method="first", na_option="bottom")
@@ -50,15 +56,11 @@ def create_rankings(ex_mtx: pd.DataFrame, seed=None) -> pd.DataFrame:
5056
)
5157

5258

53-
# In [75]: %time create_rankings_df = (exp_matrix.sample(frac=1.0, replace=False, axis=1, random_state
54-
# ...: =1)[exp_matrix.columns].rank(axis=1, ascending=False, method="first", na_option="bottom").a
55-
# ...: stype(DTYPE) - 1)
59+
# In [75]: %time create_rankings_df = (exp_matrix.sample(frac=1.0, replace=False, axis=1, random_state=1)[exp_matrix.columns].rank(axis=1, ascending=False, method="first", na_option="bottom").astype(DTYPE) - 1)
5660
# CPU times: user 151 ms, sys: 4.02 ms, total: 155 ms
5761
# Wall time: 154 ms
5862
#
59-
# In [76]: %time cr_pl = pl.from_pandas(exp_matrix.T, include_index=True).select(pl.col(pl.Utf8), pl.c
60-
# ...: ol(pl.Float64).fill_nan(None).rank(method="random", seed=5, descending=True,).add(-1).cast(
61-
# ...: pl.UInt32))
63+
# In [76]: %time cr_pl = pl.from_pandas(exp_matrix.T, include_index=True).select(pl.col(pl.Utf8), pl.col(pl.Float64).fill_nan(None).rank(method="random", seed=5, descending=True,).add(-1).cast(pl.UInt32))
6264
# CPU times: user 193 ms, sys: 13.8 ms, total: 207 ms
6365
# Wall time: 52.2 ms
6466

@@ -67,13 +69,14 @@ def derive_auc_threshold(ex_mtx: pd.DataFrame) -> pd.DataFrame:
6769
"""
6870
Derive AUC thresholds for an expression matrix.
6971
70-
It is important to check that most cells have a substantial fraction of expressed/detected genes in the calculation of
71-
the AUC.
72+
It is important to check that most cells have a substantial fraction of
73+
expressed/detected genes in the calculation of the AUC.
7274
73-
:param ex_mtx: The expression profile matrix. The rows should correspond to different cells, the columns to different
74-
genes (n_cells x n_genes).
75-
:return: A dataframe with AUC threshold for different quantiles over the number cells: a fraction of 0.01 designates
76-
that when using this value as the AUC threshold for 99% of the cells all ranked genes used for AUC calculation will
75+
:param ex_mtx: The expression profile matrix. The rows should correspond to
76+
different cells, the columns to different genes (n_cells x n_genes).
77+
:return: A dataframe with AUC threshold for different quantiles over the number
78+
cells: a fraction of 0.01 designates that when using this value as the AUC
79+
threshold for 99% of the cells all ranked genes used for AUC calculation will
7780
have had a detected expression in the single-cell experiment.
7881
"""
7982
return (
@@ -98,7 +101,8 @@ def _enrichment(
98101
columns=genes,
99102
index=cells,
100103
)
101-
# To avoid additional memory burden de resulting AUCs are immediately stored in the output sync. array.
104+
# To avoid additional memory burden de resulting AUCs are immediately stored in the
105+
# output array.
102106
result_mtx = np.frombuffer(auc_mtx.get_obj(), dtype="d")
103107
inc = len(cells)
104108
for idx, module in enumerate(modules):
@@ -120,9 +124,10 @@ def aucell4r(
120124
121125
:param df_rnk: The rank matrix (n_cells x n_genes).
122126
:param signatures: The gene signatures or regulons.
123-
:param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the
124-
Area Under the recovery Curve.
125-
:param noweights: Should the weights of the genes part of a signature be used in calculation of enrichment?
127+
:param auc_threshold: The fraction of the ranked genome to take into account for
128+
the calculation of the Area Under the recovery Curve.
129+
:param noweights: Should the weights of the genes part of a signature be used in
130+
calculation of enrichment?
126131
:param normalize: Normalize the AUC values to a maximum of 1.0 per regulon.
127132
:param num_workers: The number of cores to use.
128133
:return: A dataframe with the AUCs (n_cells x n_modules).
@@ -141,15 +146,18 @@ def aucell4r(
141146
).unstack("Regulon")
142147
aucs.columns = aucs.columns.droplevel(0)
143148
else:
144-
# Decompose the rankings dataframe: the index and columns are shared with the child processes via pickling.
149+
# Decompose the rankings dataframe: the index and columns are shared with the
150+
# child processes via pickling.
145151
genes = df_rnk.columns.values
146152
cells = df_rnk.index.values
147-
# The actual rankings are shared directly. This is possible because during a fork from a parent process the child
148-
# process inherits the memory of the parent process. A RawArray is used instead of a synchronize Array because
153+
# The actual rankings are shared directly. This is possible because during a
154+
# fork from a parent process the child process inherits the memory of the
155+
# parent process. A RawArray is used instead of a synchronize Array because
149156
# these rankings are read-only.
150157
shared_ro_memory_array = RawArray(DTYPE_C, mul(*df_rnk.shape))
151158
array = np.frombuffer(shared_ro_memory_array, dtype=DTYPE)
152-
# Copy the contents of df_rank into this shared memory block using row-major ordering.
159+
# Copy the contents of df_rank into this shared memory block using row-major
160+
# ordering.
153161
array[:] = df_rnk.values.ravel(order="C")
154162

155163
# The resulting AUCs are returned via a synchronize array.
@@ -208,9 +216,10 @@ def aucell(
208216
209217
:param exp_mtx: The expression matrix (n_cells x n_genes).
210218
:param signatures: The gene signatures or regulons.
211-
:param auc_threshold: The fraction of the ranked genome to take into account for the calculation of the
212-
Area Under the recovery Curve.
213-
:param noweights: Should the weights of the genes part of a signature be used in calculation of enrichment?
219+
:param auc_threshold: The fraction of the ranked genome to take into account for
220+
the calculation of the Area Under the recovery Curve.
221+
:param noweights: Should the weights of the genes part of a signature be used in
222+
calculation of enrichment?
214223
:param normalize: Normalize the AUC values to a maximum of 1.0 per regulon.
215224
:param num_workers: The number of cores to use.
216225
:return: A dataframe with the AUCs (n_cells x n_modules).

0 commit comments

Comments
 (0)