diff --git a/docs/release-notes/0.15.1.md b/docs/release-notes/0.15.1.md new file mode 100644 index 00000000..d5b484a2 --- /dev/null +++ b/docs/release-notes/0.15.1.md @@ -0,0 +1,5 @@ +### 0.15.1 {small}`the-future` + +```{rubric} Bug fixes +``` +* Fixes `tl.rank_genes_groups` returning NaN/zero `logfoldchanges`/`pvals` with `groups=[subset]` and `reference='rest'` {pr}`651` {smaller}`S Dicks` diff --git a/docs/release-notes/index.md b/docs/release-notes/index.md index 5aec4361..36b76ea6 100644 --- a/docs/release-notes/index.md +++ b/docs/release-notes/index.md @@ -4,6 +4,8 @@ ## Version 0.15.0 +```{include} /release-notes/0.15.1.md +``` ```{include} /release-notes/0.15.0.md ``` diff --git a/src/rapids_singlecell/tools/_rank_genes_groups/_core.py b/src/rapids_singlecell/tools/_rank_genes_groups/_core.py index c65bbf7c..019e126b 100644 --- a/src/rapids_singlecell/tools/_rank_genes_groups/_core.py +++ b/src/rapids_singlecell/tools/_rank_genes_groups/_core.py @@ -174,9 +174,16 @@ def _basic_stats(self) -> None: cat_to_idx = {str(name): i for i, name in enumerate(cat_names)} order = [cat_to_idx[str(name)] for name in self.groups_order] + # Aggregate returns stats per ALL categories. Slice to selected groups + # for per-group means/vars; keep the all-category arrays for "rest" + # stats so the totals stay correct when ``groups`` is a strict subset. + sums_all = result["sum"] + sq_sums_all = result["sq_sum"] + nnz_all = result["count_nonzero"] if self.comp_pts else None + n = cp.asarray(self.group_sizes, dtype=cp.float64)[:, None] - sums = result["sum"][order] - sq_sums = result["sq_sum"][order] + sums = sums_all[order] + sq_sums = sq_sums_all[order] # Compute means and variances from raw sums (all on GPU) means = sums / n @@ -184,23 +191,28 @@ def _basic_stats(self) -> None: vars_ = cp.maximum(group_ss / cp.maximum(n - 1, 1), 0) if self.comp_pts: - pts = result["count_nonzero"][order].astype(cp.float64) / n + pts = nnz_all[order].astype(cp.float64) / n else: pts = None - # Compute rest statistics if reference='rest' + # Compute rest statistics if reference='rest' — "rest" means every + # cell in ``groupby`` not in this group, including cells in + # categories that weren't selected via ``groups=``. if self.ireference is None: - n_rest = n.sum() - n - means_rest = (sums.sum(axis=0) - sums) / n_rest - rest_ss = (sq_sums.sum(axis=0) - sq_sums) - n_rest * means_rest**2 + n_total = agg.n_cells.sum() + n_rest = n_total - n + means_rest = (sums_all.sum(axis=0) - sums) / n_rest + rest_ss = (sq_sums_all.sum(axis=0) - sq_sums) - n_rest * means_rest**2 vars_rest = cp.maximum(rest_ss / cp.maximum(n_rest - 1, 1), 0) self.means_rest = cp.asnumpy(means_rest) self.vars_rest = cp.asnumpy(vars_rest) if self.comp_pts: - total_count = (pts * n).sum(axis=0) - self.pts_rest = cp.asnumpy((total_count - pts * n) / n_rest) + nnz_total = nnz_all.sum(axis=0) + self.pts_rest = cp.asnumpy( + (nnz_total - nnz_all[order]).astype(cp.float64) / n_rest + ) else: self.pts_rest = None else: diff --git a/tests/test_rank_genes_groups_ttest.py b/tests/test_rank_genes_groups_ttest.py index 8fe93ae7..e1684536 100644 --- a/tests/test_rank_genes_groups_ttest.py +++ b/tests/test_rank_genes_groups_ttest.py @@ -149,6 +149,63 @@ def test_rank_genes_groups_ttest_subset_and_bonferroni(reference, method): assert np.all(adjusted <= 1.0) +@pytest.mark.parametrize( + ("groups", "reference"), + [ + (["0"], "rest"), + (["0", "2"], "rest"), + (["0"], "1"), + (["0", "2"], "1"), + ], +) +@pytest.mark.parametrize("method", ["t-test", "t-test_overestim_var"]) +def test_rank_genes_groups_ttest_subset_matches_scanpy(method, groups, reference): + np.random.seed(42) + adata_gpu = sc.datasets.blobs(n_variables=8, n_centers=5, n_observations=200) + adata_gpu.obs["blobs"] = adata_gpu.obs["blobs"].astype("category") + adata_cpu = adata_gpu.copy() + + rsc.tl.rank_genes_groups( + adata_gpu, + "blobs", + method=method, + groups=groups, + reference=reference, + use_raw=False, + ) + sc.tl.rank_genes_groups( + adata_cpu, + "blobs", + method=method, + groups=groups, + reference=reference, + use_raw=False, + ) + + gpu_result = adata_gpu.uns["rank_genes_groups"] + cpu_result = adata_cpu.uns["rank_genes_groups"] + + assert gpu_result["names"].dtype.names == cpu_result["names"].dtype.names + for group in gpu_result["names"].dtype.names: + gpu_names = list(gpu_result["names"][group]) + cpu_names = list(cpu_result["names"][group]) + for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"): + gpu_map = dict( + zip(gpu_names, np.asarray(gpu_result[field][group], dtype=float)) + ) + cpu_map = dict( + zip(cpu_names, np.asarray(cpu_result[field][group], dtype=float)) + ) + for gene, gpu_val in gpu_map.items(): + np.testing.assert_allclose( + gpu_val, + cpu_map[gene], + rtol=1e-6, + atol=1e-8, + err_msg=f"{field} mismatch for gene {gene} group {group}", + ) + + @pytest.mark.parametrize( "reference_before,reference_after", [("rest", "rest"), ("1", "One")], diff --git a/tests/test_rank_genes_groups_wilcoxon.py b/tests/test_rank_genes_groups_wilcoxon.py index 0c6844da..7f32f0e5 100644 --- a/tests/test_rank_genes_groups_wilcoxon.py +++ b/tests/test_rank_genes_groups_wilcoxon.py @@ -148,6 +148,69 @@ def test_rank_genes_groups_wilcoxon_subset_and_bonferroni(reference): assert np.all(adjusted <= 1.0) +@pytest.mark.parametrize( + ("groups", "reference"), + [ + (["0"], "rest"), + (["0", "2"], "rest"), + (["0"], "1"), + (["0", "2"], "1"), + ], +) +@pytest.mark.parametrize("tie_correct", [False, True]) +@pytest.mark.parametrize("pre_load", [False, True]) +def test_rank_genes_groups_wilcoxon_subset_matches_scanpy( + groups, reference, tie_correct, pre_load +): + np.random.seed(42) + adata_gpu = sc.datasets.blobs(n_variables=8, n_centers=5, n_observations=200) + adata_gpu.obs["blobs"] = adata_gpu.obs["blobs"].astype("category") + adata_cpu = adata_gpu.copy() + + rsc.tl.rank_genes_groups( + adata_gpu, + "blobs", + method="wilcoxon", + groups=groups, + reference=reference, + use_raw=False, + tie_correct=tie_correct, + pre_load=pre_load, + ) + sc.tl.rank_genes_groups( + adata_cpu, + "blobs", + method="wilcoxon", + groups=groups, + reference=reference, + use_raw=False, + tie_correct=tie_correct, + ) + + gpu_result = adata_gpu.uns["rank_genes_groups"] + cpu_result = adata_cpu.uns["rank_genes_groups"] + + assert gpu_result["names"].dtype.names == cpu_result["names"].dtype.names + for group in gpu_result["names"].dtype.names: + gpu_names = list(gpu_result["names"][group]) + cpu_names = list(cpu_result["names"][group]) + for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"): + gpu_map = dict( + zip(gpu_names, np.asarray(gpu_result[field][group], dtype=float)) + ) + cpu_map = dict( + zip(cpu_names, np.asarray(cpu_result[field][group], dtype=float)) + ) + for gene, gpu_val in gpu_map.items(): + np.testing.assert_allclose( + gpu_val, + cpu_map[gene], + rtol=1e-6, + atol=1e-8, + err_msg=f"{field} mismatch for gene {gene} group {group}", + ) + + @pytest.mark.parametrize( "reference_before,reference_after", [("rest", "rest"), ("1", "One")], diff --git a/tests/test_rank_genes_groups_wilcoxon_binned.py b/tests/test_rank_genes_groups_wilcoxon_binned.py index 96af0711..85abc3e2 100644 --- a/tests/test_rank_genes_groups_wilcoxon_binned.py +++ b/tests/test_rank_genes_groups_wilcoxon_binned.py @@ -264,7 +264,9 @@ def test_reference_matches_exact(self, adata_blobs, reference, groups): @pytest.mark.parametrize( ("reference", "groups"), [ + pytest.param("rest", ["0"], id="rest_single_group"), pytest.param("rest", ["0", "2"], id="rest_group_subset"), + pytest.param("1", ["0"], id="ref_single_group"), pytest.param("1", ["0", "1", "2"], id="ref_group_subset"), ], ) @@ -300,9 +302,17 @@ def test_group_subset_matches_all_groups(self, adata_blobs, reference, groups): ) for group in result_sub["names"].dtype.names: - scores_all = np.asarray(result_all["scores"][group], dtype=float) - scores_sub = np.asarray(result_sub["scores"][group], dtype=float) - np.testing.assert_allclose(scores_all, scores_sub, rtol=1e-10) + assert tuple(result_all["names"][group]) == tuple( + result_sub["names"][group] + ), f"names mismatch for group {group}" + for field in ("scores", "logfoldchanges", "pvals", "pvals_adj"): + np.testing.assert_allclose( + np.asarray(result_all[field][group], dtype=float), + np.asarray(result_sub[field][group], dtype=float), + rtol=1e-10, + atol=1e-12, + err_msg=f"{field} mismatch for group {group}", + ) @pytest.mark.parametrize("reference", ["rest", "1"]) def test_unsorted_groups(self, adata_blobs, reference):