scverse · Intron7 · Apr 29, 2026 · Apr 29, 2026 · Apr 29, 2026 · Apr 30, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -71,6 +71,8 @@ if (RSC_BUILD_EXTENSIONS)
   add_nb_cuda_module(_qc_dask_cuda      src/rapids_singlecell/_cuda/qc_dask/qc_kernels_dask.cu)
   add_nb_cuda_module(_bbknn_cuda        src/rapids_singlecell/_cuda/bbknn/bbknn.cu)
   add_nb_cuda_module(_norm_cuda         src/rapids_singlecell/_cuda/norm/norm.cu)
+  add_nb_cuda_module(_gmm_cuda          src/rapids_singlecell/_cuda/gmm/gmm.cu)
+  target_link_libraries(_gmm_cuda PRIVATE CUDA::cublas)
   add_nb_cuda_module(_pr_cuda           src/rapids_singlecell/_cuda/pr/pr.cu)
   add_nb_cuda_module(_nn_descent_cuda   src/rapids_singlecell/_cuda/nn_descent/nn_descent.cu)
   add_nb_cuda_module(_aucell_cuda       src/rapids_singlecell/_cuda/aucell/aucell.cu)

diff --git a/docs/api/squidpy_gpu.md b/docs/api/squidpy_gpu.md
@@ -13,4 +13,5 @@
     gr.spatial_autocorr
     gr.co_occurrence
     gr.ligrec
+    gr.calculate_niche
 ```
diff --git a/docs/release-notes/0.15.1.md b/docs/release-notes/0.15.1.md
@@ -1,5 +1,9 @@
 ### 0.15.1 {small}`the-future`
 
+```{rubric} Features
+```
+* Add `rsc.gr.calculate_niche` with flavors `neighborhood`, `utag` , and `cellcharter`. Mirrors `squidpy.gr.calculate_niche` {pr}`644` {smaller}`S Dicks`
+* Add a minimal full-covariance GMM (`squidpy_gpu._gmm.gmm_fit_predict`) used by the `cellcharter` {pr}`644` {smaller}`S Dicks`
 ```{rubric} Bug fixes
 ```
 * Fixes `tl.rank_genes_groups` returning NaN/zero `logfoldchanges`/`pvals` with `groups=[subset]` and `reference='rest'` {pr}`651` {smaller}`S Dicks`
diff --git a/src/rapids_singlecell/_cuda/__init__.py b/src/rapids_singlecell/_cuda/__init__.py
@@ -20,6 +20,7 @@
     "_bbknn_cuda",
     "_cooc_cuda",
     "_edistance_cuda",
+    "_gmm_cuda",
     "_harmony_clustering_cuda",
     "_harmony_colsum_cuda",
     "_harmony_correction_batched_cuda",

diff --git a/src/rapids_singlecell/_cuda/gmm/gmm.cu b/src/rapids_singlecell/_cuda/gmm/gmm.cu
@@ -0,0 +1,300 @@
+#include <cuda_runtime.h>
+#include <math_constants.h>
+
+#include <cublas_v2.h>
+
+#include <limits>
+#include <stdexcept>
+#include <string>
+
+#include "../cublas_helpers.cuh"
+#include "../nb_types.h"
+
+#include "kernels_gmm.cuh"
+
+using namespace nb::literals;
+
+constexpr int TILE = 16;
+constexpr int E_STEP_BLOCK = 64;
+constexpr int NORMALIZE_BLOCK = 32;
+constexpr int M_STEP_CHUNK_ROWS = 1024;
+constexpr int M_STEP_CHUNKED_MIN_ROWS = 32768;
+
+static inline void cuda_check_runtime(cudaError_t err, const char* what) {
+    if (err != cudaSuccess) {
+        throw std::runtime_error(std::string(what) +
+                                 " failed: " + cudaGetErrorString(err));
+    }
+}
+
+static inline void cublas_check_status(cublasStatus_t status,
+                                       const char* what) {
+    if (status != CUBLAS_STATUS_SUCCESS) {
+        throw std::runtime_error(std::string(what) +
+                                 " failed with cuBLAS status " +
+                                 std::to_string(static_cast<int>(status)));
+    }
+}
+
+template <typename T>
+static inline void launch_e_step(const T* X, const T* weights, const T* means,
+                                 const T* prec_chol, const T* log_det_half,
+                                 int n, int d, int K, T* log_prob, T* resp,
+                                 T* ll_per_cell, cudaStream_t stream) {
+    if (n == 0 || d == 0 || K == 0) return;
+    if (d > 64) {
+        throw std::runtime_error(
+            "_gmm_cuda.e_step supports at most 64 features");
+    }
+    {
+        size_t shmem = (size_t)(d + d * d) * sizeof(T);
+        dim3 block(E_STEP_BLOCK);
+        dim3 grid(K, (n + E_STEP_BLOCK - 1) / E_STEP_BLOCK);
+        e_step_log_prob_kernel<T><<<grid, block, shmem, stream>>>(
+            X, weights, means, prec_chol, log_det_half, n, d, K, log_prob);
+        CUDA_CHECK_LAST_ERROR(e_step_log_prob_kernel);
+    }
+    {
+        dim3 block(NORMALIZE_BLOCK);
+        dim3 grid(n);
+        e_step_normalize_kernel<T>
+            <<<grid, block, 0, stream>>>(log_prob, n, K, resp, ll_per_cell);
+        CUDA_CHECK_LAST_ERROR(e_step_normalize_kernel);
+    }
+}
+
+template <typename T>
+static inline void launch_m_step(const T* resp, const T* X, int n, int d, int K,
+                                 T reg_covar, T* weights, T* means,
+                                 T* covariances, T* workspace_N_k,
+                                 T* workspace_num, cudaStream_t stream) {
+    if (n == 0 || d == 0 || K == 0) return;
+    int tiles_d = (d + TILE - 1) / TILE;
+    T eps = std::numeric_limits<T>::epsilon();
+    if (n < M_STEP_CHUNKED_MIN_ROWS) {
+        dim3 block(TILE, TILE);
+        dim3 grid(K, tiles_d, tiles_d);
+        m_step_fused_kernel<T, TILE><<<grid, block, 0, stream>>>(
+            resp, X, n, d, K, workspace_N_k, workspace_num, covariances);
+        CUDA_CHECK_LAST_ERROR(m_step_fused_kernel);
+    } else {
+        size_t n_k_bytes = static_cast<size_t>(K) * sizeof(T);
+        size_t num_bytes = static_cast<size_t>(K) * d * sizeof(T);
+        size_t cov_bytes = static_cast<size_t>(K) * d * d * sizeof(T);
+        cuda_check_runtime(cudaMemsetAsync(workspace_N_k, 0, n_k_bytes, stream),
+                           "cudaMemsetAsync(workspace_N_k)");
+        cuda_check_runtime(cudaMemsetAsync(workspace_num, 0, num_bytes, stream),
+                           "cudaMemsetAsync(workspace_num)");
+        cuda_check_runtime(cudaMemsetAsync(covariances, 0, cov_bytes, stream),
+                           "cudaMemsetAsync(covariances)");
+        int chunks = (n + M_STEP_CHUNK_ROWS - 1) / M_STEP_CHUNK_ROWS;
+        dim3 block(TILE, TILE);
+        dim3 grid(K, tiles_d * tiles_d, chunks);
+        m_step_chunked_atomic_kernel<T, TILE><<<grid, block, 0, stream>>>(
+            resp, X, n, d, K, tiles_d, M_STEP_CHUNK_ROWS, workspace_N_k,
+            workspace_num, covariances);
+        CUDA_CHECK_LAST_ERROR(m_step_chunked_atomic_kernel);
+    }
+    {
+        int threads = 256;
+        dim3 block(threads);
+        dim3 grid(K);
+        m_step_finalize_kernel<T><<<grid, block, 0, stream>>>(
+            workspace_N_k, workspace_num, covariances, weights, means,
+            reg_covar, eps, n, d, K);
+        CUDA_CHECK_LAST_ERROR(m_step_finalize_kernel);
+    }
+}
+
+template <typename T>
+static inline void launch_m_step_cublas(const T* resp, const T* X,
+                                        const T* ones, int n, int d, int K,
+                                        T reg_covar, T* weights, T* means,
+                                        T* covariances, T* workspace_N_k,
+                                        T* workspace_num, T* workspace_centered,
+                                        cudaStream_t stream) {
+    if (n == 0 || d == 0 || K == 0) return;
+
+    cublasHandle_t handle;
+    cublas_check_status(cublasCreate(&handle), "cublasCreate");
+    cublas_check_status(cublasSetStream(handle, stream), "cublasSetStream");
+
+    T one = T(1);
+    T zero = T(0);
+    T eps = std::numeric_limits<T>::epsilon();
+
+    // Row-major resp(n,K) is cuBLAS column-major (K,n). N_k = resp.T @ 1.
+    cublas_check_status(cublas_gemv<T>(handle, CUBLAS_OP_N, K, n, &one, resp, K,
+                                       ones, 1, &zero, workspace_N_k, 1),
+                        "cublas_gemv(N_k)");
+
+    // Row-major X(n,d) is cuBLAS column-major (d,n). Fill row-major
+    // workspace_num(K,d) through its column-major (d,K) view with X.T @ resp.
+    cublas_check_status(
+        cublas_gemm<T>(handle, CUBLAS_OP_N, CUBLAS_OP_T, d, K, n, &one, X, d,
+                       resp, K, &zero, workspace_num, d),
+        "cublas_gemm(num)");
+
+    {
+        int threads = 256;
+        dim3 block(threads);
+        dim3 grid(K);
+        m_step_finalize_means_kernel<T><<<grid, block, 0, stream>>>(
+            workspace_N_k, workspace_num, weights, means, eps, n, d, K);
+        CUDA_CHECK_LAST_ERROR(m_step_finalize_means_kernel);
+    }
+
+    {
+        int threads = 256;
+        int blocks = (int)(((size_t)n * d + threads - 1) / threads);
+        for (int k = 0; k < K; ++k) {
+            weighted_center_kernel<T><<<blocks, threads, 0, stream>>>(
+                X, resp, means, n, d, K, k, workspace_centered);
+            CUDA_CHECK_LAST_ERROR(weighted_center_kernel);
+
+            T* cov_k = covariances + (size_t)k * d * d;
+            cublas_check_status(
+                cublas_gemm<T>(handle, CUBLAS_OP_N, CUBLAS_OP_T, d, d, n, &one,
+                               workspace_centered, d, workspace_centered, d,
+                               &zero, cov_k, d),
+                "cublas_gemm(covariance)");
+        }
+    }
+
+    {
+        int threads = 256;
+        dim3 block(threads);
+        dim3 grid(K);
+        m_step_finalize_cov_cublas_kernel<T><<<grid, block, 0, stream>>>(
+            workspace_N_k, covariances, reg_covar, eps, d, K);
+        CUDA_CHECK_LAST_ERROR(m_step_finalize_cov_cublas_kernel);
+    }
+
+    cublas_check_status(cublasDestroy(handle), "cublasDestroy");
+}
+
+template <typename Device>
+void register_bindings(nb::module_& m) {
+    m.def(
+        "e_step",
+        [](gpu_array_c<const float, Device> X,
+           gpu_array_c<const float, Device> weights,
+           gpu_array_c<const float, Device> means,
+           gpu_array_c<const float, Device> prec_chol,
+           gpu_array_c<const float, Device> log_det_half,
+           gpu_array_c<float, Device> log_prob, gpu_array_c<float, Device> resp,
+           gpu_array_c<float, Device> ll_per_cell, int n, int d, int K,
+           std::uintptr_t stream) {
+            launch_e_step<float>(X.data(), weights.data(), means.data(),
+                                 prec_chol.data(), log_det_half.data(), n, d, K,
+                                 log_prob.data(), resp.data(),
+                                 ll_per_cell.data(), (cudaStream_t)stream);
+        },
+        "X"_a, "weights"_a, "means"_a, "prec_chol"_a, "log_det_half"_a,
+        "log_prob"_a, "resp"_a, "ll_per_cell"_a, nb::kw_only(), "n"_a, "d"_a,
+        "K"_a, "stream"_a = 0);
+
+    m.def(
+        "e_step",
+        [](gpu_array_c<const double, Device> X,
+           gpu_array_c<const double, Device> weights,
+           gpu_array_c<const double, Device> means,
+           gpu_array_c<const double, Device> prec_chol,
+           gpu_array_c<const double, Device> log_det_half,
+           gpu_array_c<double, Device> log_prob,
+           gpu_array_c<double, Device> resp,
+           gpu_array_c<double, Device> ll_per_cell, int n, int d, int K,
+           std::uintptr_t stream) {
+            launch_e_step<double>(X.data(), weights.data(), means.data(),
+                                  prec_chol.data(), log_det_half.data(), n, d,
+                                  K, log_prob.data(), resp.data(),
+                                  ll_per_cell.data(), (cudaStream_t)stream);
+        },
+        "X"_a, "weights"_a, "means"_a, "prec_chol"_a, "log_det_half"_a,
+        "log_prob"_a, "resp"_a, "ll_per_cell"_a, nb::kw_only(), "n"_a, "d"_a,
+        "K"_a, "stream"_a = 0);
+
+    m.def(
+        "m_step",
+        [](gpu_array_c<const float, Device> resp,
+           gpu_array_c<const float, Device> X,
+           gpu_array_c<float, Device> weights, gpu_array_c<float, Device> means,
+           gpu_array_c<float, Device> covariances,
+           gpu_array_c<float, Device> N_k_workspace,
+           gpu_array_c<float, Device> num_workspace, int n, int d, int K,
+           float reg_covar, std::uintptr_t stream) {
+            launch_m_step<float>(resp.data(), X.data(), n, d, K, reg_covar,
+                                 weights.data(), means.data(),
+                                 covariances.data(), N_k_workspace.data(),
+                                 num_workspace.data(), (cudaStream_t)stream);
+        },
+        "resp"_a, "X"_a, "weights"_a, "means"_a, "covariances"_a,
+        "N_k_workspace"_a, "num_workspace"_a, nb::kw_only(), "n"_a, "d"_a,
+        "K"_a, "reg_covar"_a, "stream"_a = 0);
+
+    m.def(
+        "m_step_cublas",
+        [](gpu_array_c<const float, Device> resp,
+           gpu_array_c<const float, Device> X,
+           gpu_array_c<const float, Device> ones,
+           gpu_array_c<float, Device> weights, gpu_array_c<float, Device> means,
+           gpu_array_c<float, Device> covariances,
+           gpu_array_c<float, Device> N_k_workspace,
+           gpu_array_c<float, Device> num_workspace,
+           gpu_array_c<float, Device> centered_workspace, int n, int d, int K,
+           float reg_covar, std::uintptr_t stream) {
+            launch_m_step_cublas<float>(
+                resp.data(), X.data(), ones.data(), n, d, K, reg_covar,
+                weights.data(), means.data(), covariances.data(),
+                N_k_workspace.data(), num_workspace.data(),
+                centered_workspace.data(), (cudaStream_t)stream);
+        },
+        "resp"_a, "X"_a, "ones"_a, "weights"_a, "means"_a, "covariances"_a,
+        "N_k_workspace"_a, "num_workspace"_a, "centered_workspace"_a,
+        nb::kw_only(), "n"_a, "d"_a, "K"_a, "reg_covar"_a, "stream"_a = 0);
+
+    m.def(
+        "m_step_cublas",
+        [](gpu_array_c<const double, Device> resp,
+           gpu_array_c<const double, Device> X,
+           gpu_array_c<const double, Device> ones,
+           gpu_array_c<double, Device> weights,
+           gpu_array_c<double, Device> means,
+           gpu_array_c<double, Device> covariances,
+           gpu_array_c<double, Device> N_k_workspace,
+           gpu_array_c<double, Device> num_workspace,
+           gpu_array_c<double, Device> centered_workspace, int n, int d, int K,
+           double reg_covar, std::uintptr_t stream) {
+            launch_m_step_cublas<double>(
+                resp.data(), X.data(), ones.data(), n, d, K, reg_covar,
+                weights.data(), means.data(), covariances.data(),
+                N_k_workspace.data(), num_workspace.data(),
+                centered_workspace.data(), (cudaStream_t)stream);
+        },
+        "resp"_a, "X"_a, "ones"_a, "weights"_a, "means"_a, "covariances"_a,
+        "N_k_workspace"_a, "num_workspace"_a, "centered_workspace"_a,
+        nb::kw_only(), "n"_a, "d"_a, "K"_a, "reg_covar"_a, "stream"_a = 0);
+
+    m.def(
+        "m_step",
+        [](gpu_array_c<const double, Device> resp,
+           gpu_array_c<const double, Device> X,
+           gpu_array_c<double, Device> weights,
+           gpu_array_c<double, Device> means,
+           gpu_array_c<double, Device> covariances,
+           gpu_array_c<double, Device> N_k_workspace,
+           gpu_array_c<double, Device> num_workspace, int n, int d, int K,
+           double reg_covar, std::uintptr_t stream) {
+            launch_m_step<double>(resp.data(), X.data(), n, d, K, reg_covar,
+                                  weights.data(), means.data(),
+                                  covariances.data(), N_k_workspace.data(),
+                                  num_workspace.data(), (cudaStream_t)stream);
+        },
+        "resp"_a, "X"_a, "weights"_a, "means"_a, "covariances"_a,
+        "N_k_workspace"_a, "num_workspace"_a, nb::kw_only(), "n"_a, "d"_a,
+        "K"_a, "reg_covar"_a, "stream"_a = 0);
+}
+
+NB_MODULE(_gmm_cuda, m) {
+    REGISTER_GPU_BINDINGS(register_bindings, m);
+}