diff --git a/cpp/include/cudf/detail/utilities/host_vector.hpp b/cpp/include/cudf/detail/utilities/host_vector.hpp index f06b864276b..40ae0a366b5 100644 --- a/cpp/include/cudf/detail/utilities/host_vector.hpp +++ b/cpp/include/cudf/detail/utilities/host_vector.hpp @@ -99,6 +99,9 @@ class rmm_host_allocator { /** * @brief Construct from a `cudf::host_async_resource_ref` */ +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif template rmm_host_allocator(async_host_resource_ref _mr, rmm::cuda_stream_view _stream) : mr(_mr), @@ -107,6 +110,26 @@ class rmm_host_allocator { { } +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + rmm_host_allocator(rmm_host_allocator const&) = default; + +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + rmm_host_allocator(rmm_host_allocator&&) = default; + +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + rmm_host_allocator& operator=(rmm_host_allocator const&) = default; + +#ifdef __CUDACC__ +#pragma nv_exec_check_disable +#endif + rmm_host_allocator& operator=(rmm_host_allocator&&) = default; + /** * @brief This method allocates storage for objects in host memory. * diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu index f246a9ef6d8..32e5cbc2386 100644 --- a/cpp/src/copying/contiguous_split.cu +++ b/cpp/src/copying/contiguous_split.cu @@ -2040,7 +2040,7 @@ struct contiguous_split_state { cudf::table_view const input; ///< The input table_view to operate on std::size_t const user_buffer_size; ///< The size of the user buffer for the chunked_pack case rmm::cuda_stream_view const stream; - std::optional mr; ///< The resource for any data returned + std::optional mr; ///< The resource for any data returned // this resource defaults to `mr` for the contiguous_split case, but it can be useful for the // `chunked_pack` case to allocate scratch/temp memory in a pool diff --git a/cpp/src/io/fst/dispatch_dfa.cuh b/cpp/src/io/fst/dispatch_dfa.cuh index 9fb4365199d..5841813a51a 100644 --- a/cpp/src/io/fst/dispatch_dfa.cuh +++ b/cpp/src/io/fst/dispatch_dfa.cuh @@ -14,6 +14,64 @@ namespace cudf::io::fst::detail { +/** + * @brief Alias temporaries to externally-allocated device storage (or simply return the amount of + * storage needed). + * + * This is a replacement for the removed `cub::AliasTemporaries` function. + * + * @param[in] d_temp_storage + * Device-accessible allocation of temporary storage. + * When nullptr, the required allocation size is written to @p temp_storage_bytes and no work is + * done. + * + * @param[in,out] temp_storage_bytes + * Size in bytes of @p d_temp_storage allocation + * + * @param[in,out] allocations + * Pointers to device allocations needed + * + * @param[in] allocation_sizes + * Sizes in bytes of device allocations needed + */ +template +cudaError_t AliasTemporaries(void* d_temp_storage, + size_t& temp_storage_bytes, + void* (&allocations)[ALLOCATIONS], + size_t const (&allocation_sizes)[ALLOCATIONS]) +{ + constexpr size_t ALIGN_BYTES = 256; + constexpr size_t ALIGN_MASK = ~(ALIGN_BYTES - 1); + + // Compute exclusive prefix sum over allocation requests + size_t allocation_offsets[ALLOCATIONS]; + size_t bytes_needed = 0; + for (int i = 0; i < ALLOCATIONS; ++i) { + size_t const allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK; + allocation_offsets[i] = bytes_needed; + bytes_needed += allocation_bytes; + } + bytes_needed += ALIGN_BYTES - 1; + + // Check if the caller is simply requesting the size of the storage allocation + if (!d_temp_storage) { + temp_storage_bytes = bytes_needed; + return cudaSuccess; + } + + // Check if enough storage provided + if (temp_storage_bytes < bytes_needed) { return CubDebug(cudaErrorInvalidValue); } + + // Alias + d_temp_storage = reinterpret_cast( + (reinterpret_cast(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK); + for (int i = 0; i < ALLOCATIONS; ++i) { + allocations[i] = static_cast(d_temp_storage) + allocation_offsets[i]; + } + + return cudaSuccess; +} + /** * @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters. * @@ -161,8 +219,8 @@ struct DispatchFSM : DeviceFSMPolicy { cudaError_t error; // Get PTX version - int ptx_version; - error = cub::PtxVersion(ptx_version); + int ptx_version = 0; + error = cub::PtxVersion(ptx_version); if (error != cudaSuccess) return error; // Create dispatch functor @@ -381,8 +439,7 @@ struct DispatchFSM : DeviceFSMPolicy { // Alias the temporary allocations from the single storage blob (or compute the necessary size // of the blob) - error = cub::detail::AliasTemporaries( - d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); + error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes); if (error != cudaSuccess) return error; // Return if the caller is simply requesting the size of the storage allocation diff --git a/cpp/src/sort/segmented_top_k.cu b/cpp/src/sort/segmented_top_k.cu index 1ab2de97b83..8035d267b26 100644 --- a/cpp/src/sort/segmented_top_k.cu +++ b/cpp/src/sort/segmented_top_k.cu @@ -20,6 +20,7 @@ #include +#include #include #include #include @@ -56,7 +57,7 @@ CUDF_KERNEL void resolve_segment_indices(device_span d_offsets, if (index == 0) { auto const segment_size = segment_end - segment_start; - auto const segment_index = thrust::distance(d_offsets.begin(), sitr) - 1; + auto const segment_index = cuda::std::distance(d_offsets.begin(), sitr) - 1; // segment is k or less elements d_segment_sizes[segment_index] = cuda::std::min(k, segment_size); }