Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/release-notes/0.15.1.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### 0.15.1 {small}`the-future`

```{rubric} Bug fixes
```
* Fixes `tl.rank_genes_groups` returning NaN/zero `logfoldchanges`/`pvals` with `groups=[subset]` and `reference='rest'` {pr}`651` {smaller}`S Dicks`
2 changes: 2 additions & 0 deletions docs/release-notes/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@


## Version 0.15.0
```{include} /release-notes/0.15.1.md
```
```{include} /release-notes/0.15.0.md
```

Expand Down
30 changes: 21 additions & 9 deletions src/rapids_singlecell/tools/_rank_genes_groups/_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,33 +174,45 @@ def _basic_stats(self) -> None:
cat_to_idx = {str(name): i for i, name in enumerate(cat_names)}
order = [cat_to_idx[str(name)] for name in self.groups_order]

# Aggregate returns stats per ALL categories. Slice to selected groups
# for per-group means/vars; keep the all-category arrays for "rest"
# stats so the totals stay correct when ``groups`` is a strict subset.
sums_all = result["sum"]
sq_sums_all = result["sq_sum"]
nnz_all = result["count_nonzero"] if self.comp_pts else None

n = cp.asarray(self.group_sizes, dtype=cp.float64)[:, None]
sums = result["sum"][order]
sq_sums = result["sq_sum"][order]
sums = sums_all[order]
sq_sums = sq_sums_all[order]

# Compute means and variances from raw sums (all on GPU)
means = sums / n
group_ss = sq_sums - n * means**2
vars_ = cp.maximum(group_ss / cp.maximum(n - 1, 1), 0)

if self.comp_pts:
pts = result["count_nonzero"][order].astype(cp.float64) / n
pts = nnz_all[order].astype(cp.float64) / n
else:
pts = None

# Compute rest statistics if reference='rest'
# Compute rest statistics if reference='rest' — "rest" means every
# cell in ``groupby`` not in this group, including cells in
# categories that weren't selected via ``groups=``.
if self.ireference is None:
n_rest = n.sum() - n
means_rest = (sums.sum(axis=0) - sums) / n_rest
rest_ss = (sq_sums.sum(axis=0) - sq_sums) - n_rest * means_rest**2
n_total = agg.n_cells.sum()
n_rest = n_total - n
means_rest = (sums_all.sum(axis=0) - sums) / n_rest
rest_ss = (sq_sums_all.sum(axis=0) - sq_sums) - n_rest * means_rest**2
vars_rest = cp.maximum(rest_ss / cp.maximum(n_rest - 1, 1), 0)

self.means_rest = cp.asnumpy(means_rest)
self.vars_rest = cp.asnumpy(vars_rest)

if self.comp_pts:
total_count = (pts * n).sum(axis=0)
self.pts_rest = cp.asnumpy((total_count - pts * n) / n_rest)
nnz_total = nnz_all.sum(axis=0)
self.pts_rest = cp.asnumpy(
(nnz_total - nnz_all[order]).astype(cp.float64) / n_rest
)
else:
self.pts_rest = None
else:
Expand Down
57 changes: 57 additions & 0 deletions tests/test_rank_genes_groups_ttest.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,63 @@ def test_rank_genes_groups_ttest_subset_and_bonferroni(reference, method):
assert np.all(adjusted <= 1.0)


@pytest.mark.parametrize(
("groups", "reference"),
[
(["0"], "rest"),
(["0", "2"], "rest"),
(["0"], "1"),
(["0", "2"], "1"),
],
)
@pytest.mark.parametrize("method", ["t-test", "t-test_overestim_var"])
def test_rank_genes_groups_ttest_subset_matches_scanpy(method, groups, reference):
np.random.seed(42)
adata_gpu = sc.datasets.blobs(n_variables=8, n_centers=5, n_observations=200)
adata_gpu.obs["blobs"] = adata_gpu.obs["blobs"].astype("category")
adata_cpu = adata_gpu.copy()

rsc.tl.rank_genes_groups(
adata_gpu,
"blobs",
method=method,
groups=groups,
reference=reference,
use_raw=False,
)
sc.tl.rank_genes_groups(
adata_cpu,
"blobs",
method=method,
groups=groups,
reference=reference,
use_raw=False,
)

gpu_result = adata_gpu.uns["rank_genes_groups"]
cpu_result = adata_cpu.uns["rank_genes_groups"]

assert gpu_result["names"].dtype.names == cpu_result["names"].dtype.names
for group in gpu_result["names"].dtype.names:
gpu_names = list(gpu_result["names"][group])
cpu_names = list(cpu_result["names"][group])
for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"):
gpu_map = dict(
zip(gpu_names, np.asarray(gpu_result[field][group], dtype=float))
)
cpu_map = dict(
zip(cpu_names, np.asarray(cpu_result[field][group], dtype=float))
)
for gene, gpu_val in gpu_map.items():
np.testing.assert_allclose(
gpu_val,
cpu_map[gene],
rtol=1e-6,
atol=1e-8,
err_msg=f"{field} mismatch for gene {gene} group {group}",
)


@pytest.mark.parametrize(
"reference_before,reference_after",
[("rest", "rest"), ("1", "One")],
Expand Down
63 changes: 63 additions & 0 deletions tests/test_rank_genes_groups_wilcoxon.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,69 @@ def test_rank_genes_groups_wilcoxon_subset_and_bonferroni(reference):
assert np.all(adjusted <= 1.0)


@pytest.mark.parametrize(
("groups", "reference"),
[
(["0"], "rest"),
(["0", "2"], "rest"),
(["0"], "1"),
(["0", "2"], "1"),
],
)
@pytest.mark.parametrize("tie_correct", [False, True])
@pytest.mark.parametrize("pre_load", [False, True])
def test_rank_genes_groups_wilcoxon_subset_matches_scanpy(
groups, reference, tie_correct, pre_load
):
np.random.seed(42)
adata_gpu = sc.datasets.blobs(n_variables=8, n_centers=5, n_observations=200)
adata_gpu.obs["blobs"] = adata_gpu.obs["blobs"].astype("category")
adata_cpu = adata_gpu.copy()

rsc.tl.rank_genes_groups(
adata_gpu,
"blobs",
method="wilcoxon",
groups=groups,
reference=reference,
use_raw=False,
tie_correct=tie_correct,
pre_load=pre_load,
)
sc.tl.rank_genes_groups(
adata_cpu,
"blobs",
method="wilcoxon",
groups=groups,
reference=reference,
use_raw=False,
tie_correct=tie_correct,
)

gpu_result = adata_gpu.uns["rank_genes_groups"]
cpu_result = adata_cpu.uns["rank_genes_groups"]

assert gpu_result["names"].dtype.names == cpu_result["names"].dtype.names
for group in gpu_result["names"].dtype.names:
gpu_names = list(gpu_result["names"][group])
cpu_names = list(cpu_result["names"][group])
for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"):
gpu_map = dict(
zip(gpu_names, np.asarray(gpu_result[field][group], dtype=float))
)
cpu_map = dict(
zip(cpu_names, np.asarray(cpu_result[field][group], dtype=float))
)
for gene, gpu_val in gpu_map.items():
np.testing.assert_allclose(
gpu_val,
cpu_map[gene],
rtol=1e-6,
atol=1e-8,
err_msg=f"{field} mismatch for gene {gene} group {group}",
)


@pytest.mark.parametrize(
"reference_before,reference_after",
[("rest", "rest"), ("1", "One")],
Expand Down
16 changes: 13 additions & 3 deletions tests/test_rank_genes_groups_wilcoxon_binned.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,7 +264,9 @@ def test_reference_matches_exact(self, adata_blobs, reference, groups):
@pytest.mark.parametrize(
("reference", "groups"),
[
pytest.param("rest", ["0"], id="rest_single_group"),
pytest.param("rest", ["0", "2"], id="rest_group_subset"),
pytest.param("1", ["0"], id="ref_single_group"),
pytest.param("1", ["0", "1", "2"], id="ref_group_subset"),
],
)
Expand Down Expand Up @@ -300,9 +302,17 @@ def test_group_subset_matches_all_groups(self, adata_blobs, reference, groups):
)

for group in result_sub["names"].dtype.names:
scores_all = np.asarray(result_all["scores"][group], dtype=float)
scores_sub = np.asarray(result_sub["scores"][group], dtype=float)
np.testing.assert_allclose(scores_all, scores_sub, rtol=1e-10)
assert tuple(result_all["names"][group]) == tuple(
result_sub["names"][group]
), f"names mismatch for group {group}"
for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"):
np.testing.assert_allclose(
np.asarray(result_all[field][group], dtype=float),
np.asarray(result_sub[field][group], dtype=float),
rtol=1e-10,
atol=1e-12,
err_msg=f"{field} mismatch for group {group}",
)

@pytest.mark.parametrize("reference", ["rest", "1"])
def test_unsorted_groups(self, adata_blobs, reference):
Expand Down
Loading