
Commit 4832834

batched_memset to use a host_span arg instead of std::vector (#19020)
Related to #18968 (comment). This PR updates the `batched_memset` cuIO utility to take a `host_span` argument instead of a `std::vector`, so that `cudf::host_vector`s or `cudf::pinned_vector`s can be used as input in the future.

Authors:
- Muhammad Haseeb (https://github.com/mhaseeb123)

Approvers:
- David Wendt (https://github.com/davidwendt)
- Bradley Dice (https://github.com/bdice)

URL: #19020
1 parent 9f92842 · commit 4832834
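With a `host_span` parameter, call sites are no longer tied to `std::vector`: any contiguous host container that converts to `cudf::host_span` works, including `cudf::detail::host_vector` (which may be backed by pinned memory). A minimal sketch of the new call pattern; the `zero_all` helper and its buffers are hypothetical, not part of this PR:

```cpp
#include <cudf/detail/utilities/batched_memset.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

#include <algorithm>
#include <vector>

// Hypothetical helper: zero a set of device buffers with one batched launch.
void zero_all(std::vector<rmm::device_uvector<int>>& bufs, rmm::cuda_stream_view stream)
{
  // make_host_vector allocates through cudf's host memory resource (possibly
  // pinned) -- exactly the kind of input the old std::vector signature rejected.
  auto spans = cudf::detail::make_host_vector<cudf::device_span<int>>(bufs.size(), stream);
  std::transform(bufs.begin(), bufs.end(), spans.begin(), [](auto& b) {
    return cudf::device_span<int>{b.data(), b.size()};
  });
  cudf::detail::batched_memset<int>(spans, 0, stream);
}
```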

4 files changed: +56 −52 lines

cpp/include/cudf/detail/utilities/batched_memcpy.hpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -30,8 +30,8 @@ namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
- * @brief A helper function that copies a vector of vectors from source to destination addresses in
- * a batched manner.
+ * @brief Helper to batched memcpy specified numbers of bytes from source device iterators to
+ * destination device iterators
  *
  * @tparam SrcIterator **[inferred]** The type of device-accessible source addresses iterator
  * @tparam DstIterator **[inferred]** The type of device-accessible destination address iterator
```
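For context, these cuIO utilities are thin wrappers over CUB's batched device algorithms. A minimal standalone sketch of that underlying pattern, using `cub::DeviceMemcpy::Batched` directly (the `batched_copy` helper and its pointer arrays are illustrative assumptions, not cudf API):

```cpp
#include <cub/device/device_memcpy.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/device_uvector.hpp>

// Copy many small, variably-sized buffers in one fused launch. CUB's batched
// algorithms follow the usual two-phase pattern: the first call, with a null
// temp-storage pointer, only computes the required scratch size.
void batched_copy(rmm::device_uvector<void*> const& srcs,
                  rmm::device_uvector<void*> const& dsts,
                  rmm::device_uvector<size_t> const& sizes,
                  rmm::cuda_stream_view stream)
{
  size_t temp_bytes = 0;
  cub::DeviceMemcpy::Batched(
    nullptr, temp_bytes, srcs.data(), dsts.data(), sizes.data(), srcs.size(), stream.value());

  rmm::device_buffer temp(temp_bytes, stream);  // allocate scratch, then run
  cub::DeviceMemcpy::Batched(
    temp.data(), temp_bytes, srcs.data(), dsts.data(), sizes.data(), srcs.size(), stream.value());
}
```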

cpp/include/cudf/detail/utilities/batched_memset.hpp

Lines changed: 20 additions & 21 deletions
```diff
@@ -33,51 +33,50 @@ namespace CUDF_EXPORT cudf {
 namespace detail {
 
 /**
- * @brief A helper function that takes in a vector of device spans and memsets them to the
- * value provided using batches sent to the GPU.
+ * @brief Helper to batched memset a host span of device spans to the provided value
  *
- * @param bufs Vector with device spans of data
+ * @param host_buffers Host span of device spans of data
  * @param value Value to memset all device spans to
- * @param _stream Stream used for device memory operations and kernel launches
+ * @param stream Stream used for device memory operations and kernel launches
  *
  * @return The data in device spans all set to value
 */
 template <typename T>
-void batched_memset(std::vector<cudf::device_span<T>> const& bufs,
+void batched_memset(cudf::host_span<cudf::device_span<T> const> host_buffers,
                     T const value,
                     rmm::cuda_stream_view stream)
 {
-  // define task and bytes parameters
-  auto const num_bufs = bufs.size();
+  // Copy buffer spans into device memory and then get sizes
+  auto buffers = cudf::detail::make_device_uvector_async(
+    host_buffers, stream, cudf::get_current_device_resource_ref());
 
-  // copy bufs into device memory and then get sizes
-  auto gpu_bufs =
-    cudf::detail::make_device_uvector_async(bufs, stream, cudf::get_current_device_resource_ref());
-
-  // get a vector with the sizes of all buffers
+  // Vector of sizes of all buffer spans
   auto sizes = thrust::make_transform_iterator(
-    thrust::counting_iterator<std::size_t>(0),
-    cuda::proclaim_return_type<std::size_t>(
-      [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].size(); }));
+    thrust::counting_iterator<size_t>(0),
+    cuda::proclaim_return_type<size_t>(
+      [buffers = buffers.data()] __device__(size_t i) { return buffers[i].size(); }));
 
-  // get an iterator with a constant value to memset
+  // Constant iterator to the value to memset
   auto iter_in = thrust::make_constant_iterator(thrust::make_constant_iterator(value));
 
-  // get an iterator pointing to each device span
+  // Iterator to each device span pointer
   auto iter_out = thrust::make_transform_iterator(
-    thrust::counting_iterator<std::size_t>(0),
+    thrust::counting_iterator<size_t>(0),
     cuda::proclaim_return_type<T*>(
-      [gpu_bufs = gpu_bufs.data()] __device__(std::size_t i) { return gpu_bufs[i].data(); }));
+      [buffers = buffers.data()] __device__(size_t i) { return buffers[i].data(); }));
 
   size_t temp_storage_bytes = 0;
+  auto const num_buffers = host_buffers.size();
 
-  cub::DeviceCopy::Batched(nullptr, temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream);
+  cub::DeviceCopy::Batched(
+    nullptr, temp_storage_bytes, iter_in, iter_out, sizes, num_buffers, stream);
 
+  // Allocate temporary storage
   rmm::device_buffer d_temp_storage(
     temp_storage_bytes, stream, cudf::get_current_device_resource_ref());
 
   cub::DeviceCopy::Batched(
-    d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_bufs, stream);
+    d_temp_storage.data(), temp_storage_bytes, iter_in, iter_out, sizes, num_buffers, stream);
 }
 
 }  // namespace detail
```
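One implementation detail worth spelling out: `cub::DeviceCopy::Batched` copies from a range of input ranges, so a memset is expressed by making every buffer's "source" an unbounded run of `value`. That is what the nested `make_constant_iterator` call above achieves. A small host-side illustration of the same idea (standalone, not cudf code):

```cpp
#include <thrust/iterator/constant_iterator.h>

#include <cstdio>

int main()
{
  // Inner iterator: an unbounded sequence 42, 42, 42, ...
  auto one_source = thrust::make_constant_iterator(42);

  // Outer iterator: every buffer index maps to that same inner sequence,
  // so "buffer i" reads as 42, 42, 42, ... for any i.
  auto all_sources = thrust::make_constant_iterator(one_source);

  std::printf("%d\n", all_sources[3][7]);  // element 7 of buffer 3 -> prints 42
  return 0;
}
```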

cpp/src/io/parquet/reader_impl_preprocess.cu

Lines changed: 8 additions & 5 deletions
```diff
@@ -1577,7 +1577,7 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
   // if we have any list columns that need further processing.
   bool has_lists = false;
   // Casting to std::byte since data buffer pointer is void *
-  std::vector<cudf::device_span<std::byte>> memset_bufs;
+  std::vector<cudf::device_span<cuda::std::byte>> memset_bufs;
   // Validity Buffer is a uint32_t pointer
   std::vector<cudf::device_span<cudf::bitmask_type>> nullmask_bufs;
 
@@ -1605,7 +1605,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
             std::overflow_error);
           out_buf.create_with_mask(
             out_buf_size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr);
-          memset_bufs.emplace_back(static_cast<std::byte*>(out_buf.data()), out_buf.data_size());
+          memset_bufs.emplace_back(static_cast<cuda::std::byte*>(out_buf.data()),
+                                   out_buf.data_size());
           nullmask_bufs.emplace_back(
             out_buf.null_mask(),
             cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) /
@@ -1720,7 +1721,8 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
         // we're going to start null mask as all valid and then turn bits off if necessary
         out_buf.create_with_mask(
           buffer_size, cudf::mask_state::UNINITIALIZED, false, _stream, _mr);
-        memset_bufs.emplace_back(static_cast<std::byte*>(out_buf.data()), out_buf.data_size());
+        memset_bufs.emplace_back(static_cast<cuda::std::byte*>(out_buf.data()),
+                                 out_buf.data_size());
         nullmask_bufs.emplace_back(
           out_buf.null_mask(),
           cudf::util::round_up_safe(out_buf.null_mask_size(), sizeof(cudf::bitmask_type)) /
@@ -1730,9 +1732,10 @@ void reader::impl::allocate_columns(read_mode mode, size_t skip_rows, size_t num
     }
   }
 
-  cudf::detail::batched_memset(memset_bufs, static_cast<std::byte>(0), _stream);
+  cudf::detail::batched_memset<cuda::std::byte>(
+    memset_bufs, static_cast<cuda::std::byte>(0), _stream);
   // Need to set null mask bufs to all high bits
-  cudf::detail::batched_memset(
+  cudf::detail::batched_memset<cudf::bitmask_type>(
     nullmask_bufs, std::numeric_limits<cudf::bitmask_type>::max(), _stream);
 }
 
```
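A subtlety visible in this diff: the call sites now spell out the template argument (`batched_memset<cuda::std::byte>`, `batched_memset<cudf::bitmask_type>`). A `std::vector` reaches the new `host_span` parameter only via an implicit conversion, and template argument deduction does not look through user-defined conversions, so `T` can no longer be deduced from the first argument. A minimal sketch of the constraint (hypothetical buffers):

```cpp
#include <cudf/detail/utilities/batched_memset.hpp>
#include <cudf/utilities/span.hpp>
#include <rmm/cuda_stream_view.hpp>

#include <vector>

void example(std::vector<cudf::device_span<int>> const& bufs, rmm::cuda_stream_view stream)
{
  // cudf::detail::batched_memset(bufs, 0, stream);
  //   ^ would not compile: T is not deducible, because deduction ignores the
  //     std::vector -> host_span<device_span<T> const> conversion
  cudf::detail::batched_memset<int>(bufs, 0, stream);  // OK with T explicit
}
```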

cpp/tests/utilities_tests/batched_memset_tests.cu

Lines changed: 26 additions & 24 deletions
```diff
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -37,7 +37,7 @@ struct MultiBufferTestIntegral : public cudf::test::BaseFixture {};
 
 TEST(MultiBufferTestIntegral, BasicTest1)
 {
-  std::vector<size_t> const BUF_SIZES{
+  std::vector<size_t> const buffer_sizes{
     50000, 4, 1000, 0, 250000, 1, 100, 8000, 0, 1, 100, 1000, 10000, 100000, 0, 1, 100000};
 
   // Device init
@@ -46,52 +46,54 @@ TEST(MultiBufferTestIntegral, BasicTest1)
 
   // Creating base vector for data and setting it to all 0xFF
   std::vector<std::vector<uint64_t>> expected;
-  std::transform(BUF_SIZES.begin(), BUF_SIZES.end(), std::back_inserter(expected), [](auto size) {
-    return std::vector<uint64_t>(size + 2000, std::numeric_limits<uint64_t>::max());
-  });
+  std::transform(
+    buffer_sizes.begin(), buffer_sizes.end(), std::back_inserter(expected), [](auto size) {
+      return std::vector<uint64_t>(size + 2000, std::numeric_limits<uint64_t>::max());
+    });
 
   // set buffer region to other value
-  std::for_each(thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())),
-                thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())),
-                [](auto elem) {
-                  std::fill_n(
-                    thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0xEEEEEEEEEEEEEEEE);
-                });
+  std::for_each(
+    thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), buffer_sizes.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(expected.end(), buffer_sizes.end())),
+    [](auto elem) {
+      std::fill_n(thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0xEEEEEEEEEEEEEEEE);
+    });
 
   // Copy host vector data to device
-  std::vector<rmm::device_uvector<uint64_t>> device_bufs;
+  std::vector<rmm::device_uvector<uint64_t>> device_buffers;
   std::transform(expected.begin(),
                  expected.end(),
-                 std::back_inserter(device_bufs),
+                 std::back_inserter(device_buffers),
                  [stream, mr](auto const& vec) {
                    return cudf::detail::make_device_uvector_async(vec, stream, mr);
                  });
 
   // Initialize device buffers for memset
-  std::vector<cudf::device_span<uint64_t>> memset_bufs;
+  auto buffers =
+    cudf::detail::make_host_vector<cudf::device_span<uint64_t>>(device_buffers.size(), stream);
   std::transform(
-    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), BUF_SIZES.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), BUF_SIZES.end())),
-    std::back_inserter(memset_bufs),
+    thrust::make_zip_iterator(thrust::make_tuple(device_buffers.begin(), buffer_sizes.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(device_buffers.end(), buffer_sizes.end())),
+    buffers.begin(),
     [](auto const& elem) {
       return cudf::device_span<uint64_t>(thrust::get<0>(elem).data() + 1000, thrust::get<1>(elem));
     });
 
-  // Function Call
-  cudf::detail::batched_memset(memset_bufs, uint64_t{0}, stream);
+  // Function call
+  cudf::detail::batched_memset<uint64_t>(buffers, uint64_t{0}, stream);
 
   // Set all buffer regions to 0 for expected comparison
   std::for_each(
-    thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), BUF_SIZES.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(expected.end(), BUF_SIZES.end())),
+    thrust::make_zip_iterator(thrust::make_tuple(expected.begin(), buffer_sizes.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(expected.end(), buffer_sizes.end())),
    [](auto elem) { std::fill_n(thrust::get<0>(elem).begin() + 1000, thrust::get<1>(elem), 0UL); });
 
   // Compare to see that only given buffers are zeroed out
   std::for_each(
-    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.begin(), expected.begin())),
-    thrust::make_zip_iterator(thrust::make_tuple(device_bufs.end(), expected.end())),
+    thrust::make_zip_iterator(thrust::make_tuple(device_buffers.begin(), expected.begin())),
+    thrust::make_zip_iterator(thrust::make_tuple(device_buffers.end(), expected.end())),
     [stream](auto const& elem) {
-      auto after_memset = cudf::detail::make_std_vector_async(thrust::get<0>(elem), stream);
+      auto const after_memset = cudf::detail::make_host_vector(thrust::get<0>(elem), stream);
      EXPECT_TRUE(
        std::equal(thrust::get<1>(elem).begin(), thrust::get<1>(elem).end(), after_memset.begin()));
    });
```
