From 7eabf64d218f5c0ab3c2c26c828503009bfefeda Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 13 Nov 2025 02:12:25 +0000 Subject: [PATCH 1/7] opt knn mem usage --- cpp/include/cuml/manifold/common.hpp | 21 ++++++++++++++++++++- cpp/src/umap/knn_graph/algo.cuh | 20 ++++++++++++++++---- python/cuml/cuml/common/sparsefuncs.py | 4 +++- python/cuml/cuml/manifold/umap/umap.pyx | 1 + 4 files changed, 40 insertions(+), 6 deletions(-) diff --git a/cpp/include/cuml/manifold/common.hpp b/cpp/include/cuml/manifold/common.hpp index cba8398d45..4ec0ac908c 100644 --- a/cpp/include/cuml/manifold/common.hpp +++ b/cpp/include/cuml/manifold/common.hpp @@ -5,6 +5,8 @@ #pragma once +#include + #include namespace ML { @@ -104,7 +106,24 @@ struct manifold_precomputed_knn_inputs_t : public manifold_inputs_t { knn_graph knn_graph; - bool alloc_knn_graph() const { return false; } + bool alloc_knn_graph() const + { + // Return true if data is on CPU (need to allocate device memory) + // Return false if data is already on device (no allocation needed) + auto check_is_device = [](const void* ptr) -> bool { + cudaPointerAttributes attr; + cudaError_t err = cudaPointerGetAttributes(&attr, ptr); + if (err != cudaSuccess) { + cudaGetLastError(); + return false; // Assume host pointer if query fails + } + return attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged; + }; + + bool indices_on_device = check_is_device(knn_graph.knn_indices); + bool dists_on_device = check_is_device(knn_graph.knn_dists); + return !(indices_on_device && dists_on_device); + } }; }; // end namespace ML diff --git a/cpp/src/umap/knn_graph/algo.cuh b/cpp/src/umap/knn_graph/algo.cuh index c43439fd4c..68e2a8bfd5 100644 --- a/cpp/src/umap/knn_graph/algo.cuh +++ b/cpp/src/umap/knn_graph/algo.cuh @@ -197,8 +197,14 @@ inline void launcher(const raft::handle_t& handle, const ML::UMAPParams* params, cudaStream_t stream) { - out.knn_indices = inputsA.knn_graph.knn_indices; - out.knn_dists = inputsA.knn_graph.knn_dists; + if (inputsA.alloc_knn_graph()) { + // if new space for the knn graph is allocated, copy the data from the precomputed knn graph + raft::copy(out.knn_indices, inputsA.knn_graph.knn_indices, inputsA.n * n_neighbors, stream); + raft::copy(out.knn_dists, inputsA.knn_graph.knn_dists, inputsA.n * n_neighbors, stream); + } else { + out.knn_indices = inputsA.knn_graph.knn_indices; + out.knn_dists = inputsA.knn_graph.knn_dists; + } } // Instantiation for precomputed inputs, int indices @@ -211,8 +217,14 @@ inline void launcher(const raft::handle_t& handle, const ML::UMAPParams* params, cudaStream_t stream) { - out.knn_indices = inputsA.knn_graph.knn_indices; - out.knn_dists = inputsA.knn_graph.knn_dists; + if (inputsA.alloc_knn_graph()) { + // if new space for the knn graph is allocated, copy the data from the precomputed knn graph + raft::copy(out.knn_indices, inputsA.knn_graph.knn_indices, inputsA.n * n_neighbors, stream); + raft::copy(out.knn_dists, inputsA.knn_graph.knn_dists, inputsA.n * n_neighbors, stream); + } else { + out.knn_indices = inputsA.knn_graph.knn_indices; + out.knn_dists = inputsA.knn_graph.knn_dists; + } } } // namespace Algo diff --git a/python/cuml/cuml/common/sparsefuncs.py b/python/cuml/cuml/common/sparsefuncs.py index 6a2cdf6fed..51724bd90c 100644 --- a/python/cuml/cuml/common/sparsefuncs.py +++ b/python/cuml/cuml/common/sparsefuncs.py @@ -263,7 +263,7 @@ def _determine_k_from_arrays( return total_elements // n_samples -def extract_knn_graph(knn_info, n_neighbors): +def extract_knn_graph(knn_info, n_neighbors, mem_type=None): """ Extract the nearest neighbors distances and indices from the knn_info parameter. @@ -367,6 +367,7 @@ def extract_knn_graph(knn_info, n_neighbors): deepcopy=deepcopy, check_dtype=np.int64, convert_to_dtype=np.int64, + convert_to_mem_type=mem_type, ) knn_dists_m, _, _, _ = input_to_cuml_array( @@ -375,6 +376,7 @@ def extract_knn_graph(knn_info, n_neighbors): deepcopy=deepcopy, check_dtype=np.float32, convert_to_dtype=np.float32, + convert_to_mem_type=mem_type, ) return knn_indices_m, knn_dists_m diff --git a/python/cuml/cuml/manifold/umap/umap.pyx b/python/cuml/cuml/manifold/umap/umap.pyx index 03cf80b565..c54a6f20c1 100644 --- a/python/cuml/cuml/manifold/umap/umap.pyx +++ b/python/cuml/cuml/manifold/umap/umap.pyx @@ -969,6 +969,7 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): knn_indices, knn_dists = extract_knn_graph( (knn_graph if knn_graph is not None else self.precomputed_knn), self._n_neighbors, + mem_type=False, # mirrors the input graph mem type ) if X_is_sparse: knn_indices = input_to_cuml_array( From 1b0a96e611f665c2b0bcab166de0714d02cce4d4 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 13 Nov 2025 02:48:59 +0000 Subject: [PATCH 2/7] docs --- python/cuml/cuml/manifold/umap/umap.pyx | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/cuml/cuml/manifold/umap/umap.pyx b/python/cuml/cuml/manifold/umap/umap.pyx index c54a6f20c1..d5c58b4930 100644 --- a/python/cuml/cuml/manifold/umap/umap.pyx +++ b/python/cuml/cuml/manifold/umap/umap.pyx @@ -522,7 +522,8 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): sparse array (preferably CSR/COO). This feature allows the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function. This function - should match the metric used to train the UMAP embeedings. + should match the metric used to train the UMAP embeedings. For efficient + memory usage, the precomputed knn graph should be given as CPU arrays. random_state : int, RandomState instance or None, optional (default=None) random_state is the seed used by the random number generator during embedding initialization and during sampling used by the optimizer. @@ -901,7 +902,8 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function. This function should match the metric used to train the UMAP embeedings. - Takes precedence over the precomputed_knn parameter. + Takes precedence over the precomputed_knn parameter. For efficient + memory usage, the precomputed knn graph should be given as CPU arrays. """ if len(X.shape) != 2: raise ValueError("Reshape your data: data should be two dimensional") @@ -1074,7 +1076,8 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function. This function should match the metric used to train the UMAP embeedings. - Takes precedence over the precomputed_knn parameter. + Takes precedence over the precomputed_knn parameter. For efficient + memory usage, the precomputed knn graph should be given as CPU arrays. """ self.fit(X, y, convert_dtype=convert_dtype, knn_graph=knn_graph) return self.embedding_ From a4dab3a15d2681d507cb51ce2c07c22b3b44445e Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 13 Nov 2025 02:49:47 +0000 Subject: [PATCH 3/7] tab --- python/cuml/cuml/manifold/umap/umap.pyx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cuml/cuml/manifold/umap/umap.pyx b/python/cuml/cuml/manifold/umap/umap.pyx index d5c58b4930..ce83bf41d6 100644 --- a/python/cuml/cuml/manifold/umap/umap.pyx +++ b/python/cuml/cuml/manifold/umap/umap.pyx @@ -903,7 +903,7 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): and also allows the use of a custom distance function. This function should match the metric used to train the UMAP embeedings. Takes precedence over the precomputed_knn parameter. For efficient - memory usage, the precomputed knn graph should be given as CPU arrays. + memory usage, the precomputed knn graph should be given as CPU arrays. """ if len(X.shape) != 2: raise ValueError("Reshape your data: data should be two dimensional") @@ -1077,7 +1077,7 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): and also allows the use of a custom distance function. This function should match the metric used to train the UMAP embeedings. Takes precedence over the precomputed_knn parameter. For efficient - memory usage, the precomputed knn graph should be given as CPU arrays. + memory usage, the precomputed knn graph should be given as CPU arrays. """ self.fit(X, y, convert_dtype=convert_dtype, knn_graph=knn_graph) return self.embedding_ From f1d7ba1ec4caa0a4c36b783e89ede7ca113332fd Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 13 Nov 2025 21:12:58 +0000 Subject: [PATCH 4/7] use check_pointer_residency func --- cpp/include/cuml/manifold/common.hpp | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/cpp/include/cuml/manifold/common.hpp b/cpp/include/cuml/manifold/common.hpp index 4ec0ac908c..d3cbd6c20f 100644 --- a/cpp/include/cuml/manifold/common.hpp +++ b/cpp/include/cuml/manifold/common.hpp @@ -5,6 +5,8 @@ #pragma once +#include + #include #include @@ -110,19 +112,10 @@ struct manifold_precomputed_knn_inputs_t : public manifold_inputs_t { { // Return true if data is on CPU (need to allocate device memory) // Return false if data is already on device (no allocation needed) - auto check_is_device = [](const void* ptr) -> bool { - cudaPointerAttributes attr; - cudaError_t err = cudaPointerGetAttributes(&attr, ptr); - if (err != cudaSuccess) { - cudaGetLastError(); - return false; // Assume host pointer if query fails - } - return attr.type == cudaMemoryTypeDevice || attr.type == cudaMemoryTypeManaged; - }; - - bool indices_on_device = check_is_device(knn_graph.knn_indices); - bool dists_on_device = check_is_device(knn_graph.knn_dists); - return !(indices_on_device && dists_on_device); + auto pointer_residency = raft::spatial::knn::detail::utils::check_pointer_residency( + knn_graph.knn_indices, knn_graph.knn_dists); + return pointer_residency == raft::spatial::knn::detail::utils::pointer_residency::host_only || + pointer_residency == raft::spatial::knn::detail::utils::pointer_residency::mixed; } }; From 1c13cffd84b0e4170425201a5b1c0331cc51239f Mon Sep 17 00:00:00 2001 From: jinsolp Date: Thu, 13 Nov 2025 21:14:52 +0000 Subject: [PATCH 5/7] rm cuda header --- cpp/include/cuml/manifold/common.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/include/cuml/manifold/common.hpp b/cpp/include/cuml/manifold/common.hpp index d3cbd6c20f..3d8ea3eefe 100644 --- a/cpp/include/cuml/manifold/common.hpp +++ b/cpp/include/cuml/manifold/common.hpp @@ -7,8 +7,6 @@ #include -#include - #include namespace ML { From 090d2f3c4abd81269b5aed6a0644cd4c95f01987 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 19 Nov 2025 19:36:53 +0000 Subject: [PATCH 6/7] update docs --- python/cuml/cuml/manifold/umap/umap.pyx | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/cuml/cuml/manifold/umap/umap.pyx b/python/cuml/cuml/manifold/umap/umap.pyx index ce83bf41d6..0e9333ed54 100644 --- a/python/cuml/cuml/manifold/umap/umap.pyx +++ b/python/cuml/cuml/manifold/umap/umap.pyx @@ -522,8 +522,9 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): sparse array (preferably CSR/COO). This feature allows the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function. This function - should match the metric used to train the UMAP embeedings. For efficient - memory usage, the precomputed knn graph should be given as CPU arrays. + should match the metric used to train the UMAP embeedings. For most efficient + memory usage, the precomputed knn graph should be CPU-accessible arrays + such as numpy arrays. random_state : int, RandomState instance or None, optional (default=None) random_state is the seed used by the random number generator during embedding initialization and during sampling used by the optimizer. @@ -902,8 +903,9 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function. This function should match the metric used to train the UMAP embeedings. - Takes precedence over the precomputed_knn parameter. For efficient - memory usage, the precomputed knn graph should be given as CPU arrays. + Takes precedence over the precomputed_knn parameter. For most efficient + memory usage, the precomputed knn graph should be CPU-accessible arrays + such as numpy arrays. """ if len(X.shape) != 2: raise ValueError("Reshape your data: data should be two dimensional") @@ -1076,8 +1078,9 @@ class UMAP(Base, InteropMixin, CMajorInputTagMixin, SparseInputTagMixin): the precomputation of the KNN outside of UMAP and also allows the use of a custom distance function. This function should match the metric used to train the UMAP embeedings. - Takes precedence over the precomputed_knn parameter. For efficient - memory usage, the precomputed knn graph should be given as CPU arrays. + Takes precedence over the precomputed_knn parameter. For most efficient + memory usage, the precomputed knn graph should be CPU-accessible arrays + such as numpy arrays. """ self.fit(X, y, convert_dtype=convert_dtype, knn_graph=knn_graph) return self.embedding_ From eae9fe6e36601ada7ef7491919bcf823a6bf6983 Mon Sep 17 00:00:00 2001 From: jinsolp Date: Wed, 19 Nov 2025 21:41:43 +0000 Subject: [PATCH 7/7] change default --- python/cuml/cuml/common/sparsefuncs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cuml/cuml/common/sparsefuncs.py b/python/cuml/cuml/common/sparsefuncs.py index 51724bd90c..e7c3fb5dd0 100644 --- a/python/cuml/cuml/common/sparsefuncs.py +++ b/python/cuml/cuml/common/sparsefuncs.py @@ -263,7 +263,7 @@ def _determine_k_from_arrays( return total_elements // n_samples -def extract_knn_graph(knn_info, n_neighbors, mem_type=None): +def extract_knn_graph(knn_info, n_neighbors, mem_type="device"): """ Extract the nearest neighbors distances and indices from the knn_info parameter.