rapidsai · rapids-bot · Nov 26, 2025 · Nov 25, 2025 · Nov 25, 2025 · Nov 25, 2025
@@ -99,6 +99,9 @@ class rmm_host_allocator {
   /**
    * @brief Construct from a `cudf::host_async_resource_ref`
    */
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
   template <class... Properties>
   rmm_host_allocator(async_host_resource_ref<Properties...> _mr, rmm::cuda_stream_view _stream)
     : mr(_mr),
@@ -107,6 +110,26 @@ class rmm_host_allocator {
   {
   }
 
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
+  rmm_host_allocator(rmm_host_allocator const&) = default;
+
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
+  rmm_host_allocator(rmm_host_allocator&&) = default;
+
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
+  rmm_host_allocator& operator=(rmm_host_allocator const&) = default;
+
+#ifdef __CUDACC__
+#pragma nv_exec_check_disable
+#endif
+  rmm_host_allocator& operator=(rmm_host_allocator&&) = default;
+
   /**
    * @brief This method allocates storage for objects in host memory.
    *

@@ -2041,7 +2041,7 @@ struct contiguous_split_state {
   cudf::table_view const input;        ///< The input table_view to operate on
   std::size_t const user_buffer_size;  ///< The size of the user buffer for the chunked_pack case
   rmm::cuda_stream_view const stream;
-  std::optional<rmm::device_async_resource_ref const> mr;  ///< The resource for any data returned
+  std::optional<rmm::device_async_resource_ref> mr;  ///< The resource for any data returned
 
   // this resource defaults to `mr` for the contiguous_split case, but it can be useful for the
   // `chunked_pack` case to allocate scratch/temp memory in a pool

@@ -14,6 +14,64 @@
 
 namespace cudf::io::fst::detail {
 
+/**
+ * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of
+ * storage needed).
+ *
+ * This is a replacement for the removed `cub::AliasTemporaries` function.
+ *
+ * @param[in] d_temp_storage
+ *   Device-accessible allocation of temporary storage.
+ *   When nullptr, the required allocation size is written to @p temp_storage_bytes and no work is
+ *   done.
+ *
+ * @param[in,out] temp_storage_bytes
+ *   Size in bytes of @p d_temp_storage allocation
+ *
+ * @param[in,out] allocations
+ *   Pointers to device allocations needed
+ *
+ * @param[in] allocation_sizes
+ *   Sizes in bytes of device allocations needed
+ */
+template <int ALLOCATIONS>
+cudaError_t AliasTemporaries(void* d_temp_storage,
+                             size_t& temp_storage_bytes,
+                             void* (&allocations)[ALLOCATIONS],
+                             const size_t (&allocation_sizes)[ALLOCATIONS])
+{
+  constexpr size_t ALIGN_BYTES = 256;
+  constexpr size_t ALIGN_MASK  = ~(ALIGN_BYTES - 1);
+
+  // Compute exclusive prefix sum over allocation requests
+  size_t allocation_offsets[ALLOCATIONS];
+  size_t bytes_needed = 0;
+  for (int i = 0; i < ALLOCATIONS; ++i) {
+    const size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
-    const size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+    size_t const allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
-    const size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+    size_t const allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
+    allocation_offsets[i]         = bytes_needed;
+    bytes_needed += allocation_bytes;
+  }
+  bytes_needed += ALIGN_BYTES - 1;
+
+  // Check if the caller is simply requesting the size of the storage allocation
+  if (!d_temp_storage) {
+    temp_storage_bytes = bytes_needed;
+    return cudaSuccess;
+  }
+
+  // Check if enough storage provided
+  if (temp_storage_bytes < bytes_needed) { return CubDebug(cudaErrorInvalidValue); }
+
+  // Alias
+  d_temp_storage = reinterpret_cast<void*>(
+    (reinterpret_cast<uintptr_t>(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
+  for (int i = 0; i < ALLOCATIONS; ++i) {
+    allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
+  }
+
+  return cudaSuccess;
+}
+
 /**
  * @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters.
  *
@@ -161,8 +219,8 @@ struct DispatchFSM : DeviceFSMPolicy {
     cudaError_t error;
 
     // Get PTX version
-    int ptx_version;
-    error = cub::PtxVersion(ptx_version);
+    int ptx_version = 0;
+    error           = cub::PtxVersion(ptx_version);
     if (error != cudaSuccess) return error;
 
     // Create dispatch functor
@@ -381,8 +439,7 @@ struct DispatchFSM : DeviceFSMPolicy {
 
     // Alias the temporary allocations from the single storage blob (or compute the necessary size
     // of the blob)
-    error = cub::detail::AliasTemporaries(
-      d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
+    error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
     if (error != cudaSuccess) return error;
 
     // Return if the caller is simply requesting the size of the storage allocation

@@ -20,6 +20,7 @@
 
 #include <rmm/cuda_stream_view.hpp>
 
+#include <cuda/std/iterator>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/remove.h>
@@ -56,7 +57,7 @@ CUDF_KERNEL void resolve_segment_indices(device_span<size_type const> d_offsets,
 
   if (index == 0) {
     auto const segment_size  = segment_end - segment_start;
-    auto const segment_index = thrust::distance(d_offsets.begin(), sitr) - 1;
+    auto const segment_index = cuda::std::distance(d_offsets.begin(), sitr) - 1;
     // segment is k or less elements
     d_segment_sizes[segment_index] = cuda::std::min(k, segment_size);
   }