Skip to content
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions cpp/include/cudf/detail/utilities/host_vector.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ class rmm_host_allocator {
/**
* @brief Construct from a `cudf::host_async_resource_ref`
*/
#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
template <class... Properties>
rmm_host_allocator(async_host_resource_ref<Properties...> _mr, rmm::cuda_stream_view _stream)
: mr(_mr),
Expand All @@ -107,6 +110,26 @@ class rmm_host_allocator {
{
}

#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
rmm_host_allocator(rmm_host_allocator const&) = default;

#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
rmm_host_allocator(rmm_host_allocator&&) = default;

#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
rmm_host_allocator& operator=(rmm_host_allocator const&) = default;

#ifdef __CUDACC__
#pragma nv_exec_check_disable
#endif
rmm_host_allocator& operator=(rmm_host_allocator&&) = default;

/**
* @brief This method allocates storage for objects in host memory.
*
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/copying/contiguous_split.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2041,7 +2041,7 @@ struct contiguous_split_state {
cudf::table_view const input; ///< The input table_view to operate on
std::size_t const user_buffer_size; ///< The size of the user buffer for the chunked_pack case
rmm::cuda_stream_view const stream;
std::optional<rmm::device_async_resource_ref const> mr; ///< The resource for any data returned
std::optional<rmm::device_async_resource_ref> mr; ///< The resource for any data returned

// this resource defaults to `mr` for the contiguous_split case, but it can be useful for the
// `chunked_pack` case to allocate scratch/temp memory in a pool
Expand Down
65 changes: 61 additions & 4 deletions cpp/src/io/fst/dispatch_dfa.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,64 @@

namespace cudf::io::fst::detail {

/**
* @brief Alias temporaries to externally-allocated device storage (or simply return the amount of
* storage needed).
*
* This is a replacement for the removed `cub::AliasTemporaries` function.
*
* @param[in] d_temp_storage
* Device-accessible allocation of temporary storage.
* When nullptr, the required allocation size is written to @p temp_storage_bytes and no work is
* done.
*
* @param[in,out] temp_storage_bytes
* Size in bytes of @p d_temp_storage allocation
*
* @param[in,out] allocations
* Pointers to device allocations needed
*
* @param[in] allocation_sizes
* Sizes in bytes of device allocations needed
*/
template <int ALLOCATIONS>
cudaError_t AliasTemporaries(void* d_temp_storage,
size_t& temp_storage_bytes,
void* (&allocations)[ALLOCATIONS],
const size_t (&allocation_sizes)[ALLOCATIONS])
{
constexpr size_t ALIGN_BYTES = 256;
constexpr size_t ALIGN_MASK = ~(ALIGN_BYTES - 1);

// Compute exclusive prefix sum over allocation requests
size_t allocation_offsets[ALLOCATIONS];
size_t bytes_needed = 0;
for (int i = 0; i < ALLOCATIONS; ++i) {
const size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
const size_t allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;
size_t const allocation_bytes = (allocation_sizes[i] + ALIGN_BYTES - 1) & ALIGN_MASK;

allocation_offsets[i] = bytes_needed;
bytes_needed += allocation_bytes;
}
bytes_needed += ALIGN_BYTES - 1;

// Check if the caller is simply requesting the size of the storage allocation
if (!d_temp_storage) {
temp_storage_bytes = bytes_needed;
return cudaSuccess;
}

// Check if enough storage provided
if (temp_storage_bytes < bytes_needed) { return CubDebug(cudaErrorInvalidValue); }

// Alias
d_temp_storage = reinterpret_cast<void*>(
(reinterpret_cast<uintptr_t>(d_temp_storage) + ALIGN_BYTES - 1) & ALIGN_MASK);
for (int i = 0; i < ALLOCATIONS; ++i) {
allocations[i] = static_cast<char*>(d_temp_storage) + allocation_offsets[i];
}

return cudaSuccess;
}

/**
* @brief The tuning policy comprising all the architecture-specific compile-time tuning parameters.
*
Expand Down Expand Up @@ -161,8 +219,8 @@ struct DispatchFSM : DeviceFSMPolicy {
cudaError_t error;

// Get PTX version
int ptx_version;
error = cub::PtxVersion(ptx_version);
int ptx_version = 0;
error = cub::PtxVersion(ptx_version);
if (error != cudaSuccess) return error;

// Create dispatch functor
Expand Down Expand Up @@ -381,8 +439,7 @@ struct DispatchFSM : DeviceFSMPolicy {

// Alias the temporary allocations from the single storage blob (or compute the necessary size
// of the blob)
error = cub::detail::AliasTemporaries(
d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
error = AliasTemporaries(d_temp_storage, temp_storage_bytes, allocations, allocation_sizes);
if (error != cudaSuccess) return error;

// Return if the caller is simply requesting the size of the storage allocation
Expand Down
3 changes: 2 additions & 1 deletion cpp/src/sort/segmented_top_k.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

#include <rmm/cuda_stream_view.hpp>

#include <cuda/std/iterator>
#include <thrust/binary_search.h>
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
Expand Down Expand Up @@ -56,7 +57,7 @@ CUDF_KERNEL void resolve_segment_indices(device_span<size_type const> d_offsets,

if (index == 0) {
auto const segment_size = segment_end - segment_start;
auto const segment_index = thrust::distance(d_offsets.begin(), sitr) - 1;
auto const segment_index = cuda::std::distance(d_offsets.begin(), sitr) - 1;
// segment is k or less elements
d_segment_sizes[segment_index] = cuda::std::min(k, segment_size);
}
Expand Down