Improved memory efficiency in UMAP given precomputed knn graphs (#7481)

jinsolp · web-flow · commit 45e220deb9e2 · 2025-11-21T02:21:15.000Z
Closes #7143. This PR improves memory usage in UMAP when given a precomputed KNN graph. Previously, a user-given KNN graph would occupy GPU memory throughout the full UMAP pipeline even though it is not needed in the later steps of UMAP. With this PR, if the user-given KNN graph is in host memory, we keep it in host memory and copy it to the device at the C++ level to allow better memory management. ### This PR with precomputed knn graph on CPU <img width="808" height="313" alt="Screenshot 2025-11-12 at 7 00 33 PM" src="https://github.com/user-attachments/assets/6c752f62-a1b2-4fb1-a44d-d86ed468915b" /> ### Before with precomputed knn graph on CPU <img width="828" height="316" alt="Screenshot 2025-11-12 at 7 01 12 PM" src="https://github.com/user-attachments/assets/8237fdd4-e0bb-48f5-bc46-71878ce14b33" /> Authors: - Jinsol Park (https://github.com/jinsolp) Approvers: - Philip Hyunsu Cho (https://github.com/hcho3) - Simon Adorf (https://github.com/csadorf) - Tarang Jain (https://github.com/tarang-jain) URL: #7481
diff --git a/cpp/include/cuml/manifold/common.hpp b/cpp/include/cuml/manifold/common.hpp
@@ -5,6 +5,8 @@
 
 #pragma once
 
+#include <raft/spatial/knn/detail/ann_utils.cuh>
+
 #include <stdint.h>
 
 namespace ML {
@@ -104,7 +106,15 @@ struct manifold_precomputed_knn_inputs_t : public manifold_inputs_t<value_t> {
 
   knn_graph<value_idx, value_t> knn_graph;
 
-  bool alloc_knn_graph() const { return false; }
+  bool alloc_knn_graph() const
+  {
+    // True when the graph pointers are host_only or mixed: device memory must be allocated.
+    // False otherwise: the graph is already device-accessible and needs no allocation.
+    auto pointer_residency = raft::spatial::knn::detail::utils::check_pointer_residency(
+      knn_graph.knn_indices, knn_graph.knn_dists);
+    return pointer_residency == raft::spatial::knn::detail::utils::pointer_residency::host_only ||
+           pointer_residency == raft::spatial::knn::detail::utils::pointer_residency::mixed;
+  }
 };
 
 };  // end namespace ML
diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh
@@ -197,8 +197,14 @@ inline void launcher(const raft::handle_t& handle,
                      const ML::UMAPParams* params,
                      cudaStream_t stream)
 {
-  out.knn_indices = inputsA.knn_graph.knn_indices;
-  out.knn_dists   = inputsA.knn_graph.knn_dists;
+  if (inputsA.alloc_knn_graph()) {
+    // if new space for the knn graph is allocated, copy the data from the precomputed knn graph
+    raft::copy(out.knn_indices, inputsA.knn_graph.knn_indices, inputsA.n * n_neighbors, stream);
+    raft::copy(out.knn_dists, inputsA.knn_graph.knn_dists, inputsA.n * n_neighbors, stream);
+  } else {
+    out.knn_indices = inputsA.knn_graph.knn_indices;
+    out.knn_dists   = inputsA.knn_graph.knn_dists;
+  }
 }
 
 // Instantiation for precomputed inputs, int indices
@@ -211,8 +217,14 @@ inline void launcher(const raft::handle_t& handle,
                      const ML::UMAPParams* params,
                      cudaStream_t stream)
 {
-  out.knn_indices = inputsA.knn_graph.knn_indices;
-  out.knn_dists   = inputsA.knn_graph.knn_dists;
+  if (inputsA.alloc_knn_graph()) {
+    // if new space for the knn graph is allocated, copy the data from the precomputed knn graph
+    raft::copy(out.knn_indices, inputsA.knn_graph.knn_indices, inputsA.n * n_neighbors, stream);
+    raft::copy(out.knn_dists, inputsA.knn_graph.knn_dists, inputsA.n * n_neighbors, stream);
+  } else {
+    out.knn_indices = inputsA.knn_graph.knn_indices;
+    out.knn_dists   = inputsA.knn_graph.knn_dists;
+  }
 }
 
 }  // namespace Algo
diff --git a/python/cuml/cuml/common/sparsefuncs.py b/python/cuml/cuml/common/sparsefuncs.py
@@ -263,7 +263,7 @@ def _determine_k_from_arrays(
     return total_elements // n_samples
 
 
-def extract_knn_graph(knn_info, n_neighbors):
+def extract_knn_graph(knn_info, n_neighbors, mem_type="device"):
     """
     Extract the nearest neighbors distances and indices
     from the knn_info parameter.
@@ -367,6 +367,7 @@ def extract_knn_graph(knn_info, n_neighbors):
         deepcopy=deepcopy,
         check_dtype=np.int64,
         convert_to_dtype=np.int64,
+        convert_to_mem_type=mem_type,
     )
 
     knn_dists_m, _, _, _ = input_to_cuml_array(
@@ -375,6 +376,7 @@ def extract_knn_graph(knn_info, n_neighbors):
         deepcopy=deepcopy,
         check_dtype=np.float32,
         convert_to_dtype=np.float32,
+        convert_to_mem_type=mem_type,
     )
 
     return knn_indices_m, knn_dists_m
diff --git a/python/cuml/cuml/manifold/umap/umap.pyx b/python/cuml/cuml/manifold/umap/umap.pyx
@@ -522,7 +522,9 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin):
         sparse array (preferably CSR/COO). This feature allows
         the precomputation of the KNN outside of UMAP
         and also allows the use of a custom distance function. This function
-        should match the metric used to train the UMAP embeedings.
+        should match the metric used to train the UMAP embeddings. For most efficient
+        memory usage, the precomputed knn graph should be CPU-accessible arrays
+        such as numpy arrays.
     random_state : int, RandomState instance or None, optional (default=None)
         random_state is the seed used by the random number generator during
         embedding initialization and during sampling used by the optimizer.
@@ -900,7 +902,9 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin):
             the precomputation of the KNN outside of UMAP
             and also allows the use of a custom distance function. This function
             should match the metric used to train the UMAP embeedings.
-            Takes precedence over the precomputed_knn parameter.
+            Takes precedence over the precomputed_knn parameter. For most efficient
+            memory usage, the precomputed knn graph should be CPU-accessible arrays
+            such as numpy arrays.
         """
         if len(X.shape) != 2:
             raise ValueError("Reshape your data: data should be two dimensional")
@@ -968,6 +972,7 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin):
             knn_indices, knn_dists = extract_knn_graph(
                 (knn_graph if knn_graph is not None else self.precomputed_knn),
                 self._n_neighbors,
+                mem_type=False,     # mirrors the input graph mem type
             )
             if X_is_sparse:
                 knn_indices = input_to_cuml_array(
@@ -1072,7 +1077,9 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin):
             the precomputation of the KNN outside of UMAP
             and also allows the use of a custom distance function. This function
             should match the metric used to train the UMAP embeedings.
-            Takes precedence over the precomputed_knn parameter.
+            Takes precedence over the precomputed_knn parameter. For most efficient
+            memory usage, the precomputed knn graph should be CPU-accessible arrays
+            such as numpy arrays.
         """
         self.fit(X, y, convert_dtype=convert_dtype, knn_graph=knn_graph)
         return self.embedding_