Skip to content

Commit e9b04f7

Browse files
committed
More fixes
1 parent 48ae8be commit e9b04f7

File tree

15 files changed

+123
-127
lines changed

15 files changed

+123
-127
lines changed

include/cuco/detail/bloom_filter/arrow_filter_policy.cuh

Lines changed: 12 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace cuco::detail {
3636
*
3737
* Example:
3838
* @code{.cpp}
39-
* template <typename KeyType, ::int NUM_FILTER_BLOCKS>
39+
* template <typename KeyType, std::uint32_t NUM_FILTER_BLOCKS>
4040
* void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector<KeyType> const& positive_keys,
4141
* device_vector<KeyType> const& negative_keys)
4242
* {
@@ -84,20 +84,18 @@ template <class Key, template <typename> class XXHash64>
8484
class arrow_filter_policy {
8585
public:
8686
using hasher = XXHash64<Key>; ///< 64-bit XXHash hasher for Arrow bloom filter policy
87-
using word_type = cuda::std::uint32_t; ///< uint32_t for Arrow bloom filter policy
88-
using key_type = Key; ///< Hash function input type
89-
using hash_result_type = cuda::std::uint64_t; ///< hash function output type
87+
using word_type = std::uint32_t; ///< uint32_t for Arrow bloom filter policy
88+
using key_type = Key; ///< Hash function input type
89+
using hash_result_type = std::uint64_t; ///< hash function output type
9090

91-
static constexpr cuda::std::int32_t bits_set_per_block =
92-
8; ///< hardcoded bits set per Arrow filter block
93-
static constexpr cuda::std::int32_t words_per_block =
94-
8; ///< hardcoded words per Arrow filter block
91+
static constexpr uint32_t bits_set_per_block = 8; ///< hardcoded bits set per Arrow filter block
92+
static constexpr uint32_t words_per_block = 8; ///< hardcoded words per Arrow filter block
9593

96-
static constexpr cuda::std::int32_t bytes_per_filter_block =
94+
static constexpr std::uint32_t bytes_per_filter_block =
9795
32; ///< Number of bytes in one Arrow filter block
98-
static constexpr cuda::std::int32_t max_arrow_filter_bytes =
96+
static constexpr std::uint32_t max_arrow_filter_bytes =
9997
128 * 1024 * 1024; ///< Max bytes in Arrow bloom filter
100-
static constexpr cuda::std::int32_t max_filter_blocks =
98+
static constexpr std::uint32_t max_filter_blocks =
10199
(max_arrow_filter_bytes /
102100
bytes_per_filter_block); ///< Max sub-filter blocks allowed in Arrow bloom filter
103101

@@ -155,11 +153,10 @@ class arrow_filter_policy {
155153
*
156154
* @return The bit pattern for the word/segment in the filter block
157155
*/
158-
__device__ constexpr word_type word_pattern(hash_result_type hash,
159-
cuda::std::int32_t word_index) const
156+
__device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
160157
{
161158
word_type const key = static_cast<word_type>(hash);
162-
cuda::std::int32_t salt;
159+
std::uint32_t salt;
163160

164161
// Basically a switch (word_index) { case 0-7 ... }
165162
// First split: 0..3 versus 4..7.
@@ -189,4 +186,4 @@ class arrow_filter_policy {
189186
hasher hash_;
190187
};
191188

192-
} // namespace cuco::detail
189+
} // namespace cuco::detail

include/cuco/detail/bloom_filter/bloom_filter_impl.cuh

Lines changed: 12 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,6 @@
2929
#include <cuda/std/__algorithm/min.h> // TODO #include <cuda/std/algorithm> once available
3030
#include <cuda/std/array>
3131
#include <cuda/std/bit>
32-
#include <cuda/std/cstdint>
3332
#include <cuda/std/functional>
3433
#include <cuda/std/tuple>
3534
#include <cuda/std/type_traits>
@@ -38,6 +37,8 @@
3837

3938
#include <cooperative_groups.h>
4039

40+
#include <cstdint>
41+
4142
namespace cuco::detail {
4243

4344
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
@@ -137,7 +138,7 @@ class bloom_filter_impl {
137138
__device__ void add_impl(HashValue const& hash_value, BlockIndex block_index)
138139
{
139140
#pragma unroll words_per_block
140-
for (cuda::std::int32_t i = 0; i < words_per_block; ++i) {
141+
for (uint32_t i = 0; i < words_per_block; ++i) {
141142
auto const word = policy_.word_pattern(hash_value, i);
142143
if (word != 0) {
143144
auto atom_word = cuda::atomic_ref<word_type, thread_scope>{
@@ -199,7 +200,7 @@ class bloom_filter_impl {
199200
block_index = policy_.block_index(hash_value, num_blocks_);
200201
}
201202

202-
for (cuda::std::int32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) {
203+
for (uint32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) {
203204
this->add_impl(group, group.shfl(hash_value, j), group.shfl(block_index, j));
204205
}
205206
}
@@ -219,9 +220,7 @@ class bloom_filter_impl {
219220
block_index = policy_.block_index(hash_value, num_blocks_);
220221
}
221222

222-
for (cuda::std::int32_t j = 0;
223-
(j < worker_num_threads) and (i + worker_offset + j < num_keys);
224-
++j) {
223+
for (uint32_t j = 0; (j < worker_num_threads) and (i + worker_offset + j < num_keys); ++j) {
225224
this->add_impl(
226225
worker_group, worker_group.shfl(hash_value, j), worker_group.shfl(block_index, j));
227226
}
@@ -319,7 +318,7 @@ class bloom_filter_impl {
319318
policy_.block_index(hash_value, num_blocks_) * words_per_block);
320319

321320
#pragma unroll words_per_block
322-
for (cuda::std::int32_t i = 0; i < words_per_block; ++i) {
321+
for (uint32_t i = 0; i < words_per_block; ++i) {
323322
auto const expected_pattern = policy_.word_pattern(hash_value, i);
324323
if ((stored_pattern[i] & expected_pattern) != expected_pattern) { return false; }
325324
}
@@ -343,12 +342,12 @@ class bloom_filter_impl {
343342
bool success = true;
344343

345344
#pragma unroll
346-
for (cuda::std::int32_t i = rank; i < optimal_num_threads; i += num_threads) {
345+
for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
347346
auto const thread_offset = i * words_per_thread;
348347
auto const stored_pattern = this->vec_load_words<words_per_thread>(
349348
policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset);
350349
#pragma unroll words_per_thread
351-
for (cuda::std::int32_t j = 0; j < words_per_thread; ++j) {
350+
for (uint32_t j = 0; j < words_per_thread; ++j) {
352351
auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j);
353352
if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; }
354353
}
@@ -431,25 +430,25 @@ class bloom_filter_impl {
431430
// TODO
432431
// [[nodiscard]] __host__ double occupancy() const;
433432
// [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
434-
// [[nodiscard]] __host__ __device__ static int32_t optimal_pattern_bits(size_t num_blocks)
433+
// [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
435434
// template <typename CG, cuda::thread_scope NewScope = thread_scope>
436435
// [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const
437436
// memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
438437

439438
private:
440-
template <::int NumWords>
439+
template <uint32_t NumWords>
441440
__device__ constexpr cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
442441
{
443442
return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
444443
words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes())));
445444
}
446445

447-
[[nodiscard]] __host__ __device__ static constexpr cuda::std::int32_t add_optimal_cg_size()
446+
[[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
448447
{
449448
return words_per_block; // one thread per word so atomic updates can be coalesced
450449
}
451450

452-
[[nodiscard]] __host__ __device__ static constexpr cuda::std::int32_t contains_optimal_cg_size()
451+
[[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size()
453452
{
454453
constexpr auto word_bytes = sizeof(word_type);
455454
constexpr auto block_bytes = word_bytes * words_per_block;

include/cuco/detail/bloom_filter/default_filter_policy.inl

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -16,27 +16,27 @@
1616

1717
#pragma once
1818

19-
#include <cuda/std/cstdint>
19+
#include <cstdint>
2020

2121
namespace cuco {
2222

23-
template <class Hash, class Word, ::int WordsPerBlock>
23+
template <class Hash, class Word, uint32_t WordsPerBlock>
2424
__host__
2525
__device__ constexpr default_filter_policy<Hash, Word, WordsPerBlock>::default_filter_policy(
26-
cuda::std::int32_t pattern_bits, Hash hash)
26+
uint32_t pattern_bits, Hash hash)
2727
: impl_{pattern_bits, hash}
2828
{
2929
}
3030

31-
template <class Hash, class Word, ::int WordsPerBlock>
31+
template <class Hash, class Word, uint32_t WordsPerBlock>
3232
__device__ constexpr typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type
3333
default_filter_policy<Hash, Word, WordsPerBlock>::hash(
3434
typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_argument_type const& key) const
3535
{
3636
return impl_.hash(key);
3737
}
3838

39-
template <class Hash, class Word, ::int WordsPerBlock>
39+
template <class Hash, class Word, uint32_t WordsPerBlock>
4040
template <class Extent>
4141
__device__ constexpr auto default_filter_policy<Hash, Word, WordsPerBlock>::block_index(
4242
typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type hash,
@@ -45,13 +45,13 @@ __device__ constexpr auto default_filter_policy<Hash, Word, WordsPerBlock>::bloc
4545
return impl_.block_index(hash, num_blocks);
4646
}
4747

48-
template <class Hash, class Word, ::int WordsPerBlock>
48+
template <class Hash, class Word, uint32_t WordsPerBlock>
4949
__device__ constexpr typename default_filter_policy<Hash, Word, WordsPerBlock>::word_type
5050
default_filter_policy<Hash, Word, WordsPerBlock>::word_pattern(
5151
default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type hash,
52-
cuda::std::int32_t word_index) const
52+
std::uint32_t word_index) const
5353
{
5454
return impl_.word_pattern(hash, word_index);
5555
}
5656

57-
} // namespace cuco
57+
} // namespace cuco

include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh

Lines changed: 18 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,31 +19,31 @@
1919
#include <cuco/detail/error.hpp>
2020

2121
#include <cuda/std/bit>
22-
#include <cuda/std/cstdint>
2322
#include <cuda/std/limits>
2423
#include <cuda/std/tuple>
2524
#include <cuda/std/type_traits>
2625

26+
#include <cstdint>
2727
#include <nv/target>
2828

2929
namespace cuco::detail {
3030

31-
template <class Hash, class Word, ::int WordsPerBlock>
31+
template <class Hash, class Word, uint32_t WordsPerBlock>
3232
class default_filter_policy_impl {
3333
public:
3434
using hasher = Hash;
3535
using word_type = Word;
3636
using hash_argument_type = typename hasher::argument_type;
3737
using hash_result_type = decltype(std::declval<hasher>()(std::declval<hash_argument_type>()));
3838

39-
static constexpr cuda::std::int32_t words_per_block = WordsPerBlock;
39+
static constexpr std::uint32_t words_per_block = WordsPerBlock;
4040

4141
private:
42-
static constexpr cuda::std::int32_t word_bits = cuda::std::numeric_limits<word_type>::digits;
43-
static constexpr cuda::std::int32_t bit_index_width = cuda::std::bit_width(word_bits - 1);
42+
static constexpr std::uint32_t word_bits = cuda::std::numeric_limits<word_type>::digits;
43+
static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1);
4444

4545
public:
46-
__host__ __device__ explicit constexpr default_filter_policy_impl(cuda::std::int32_t pattern_bits,
46+
__host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits,
4747
Hash hash)
4848
: pattern_bits_{pattern_bits},
4949
min_bits_per_word_{pattern_bits_ / words_per_block},
@@ -54,14 +54,14 @@ class default_filter_policy_impl {
5454
NV_IS_HOST,
5555
( // This ensures each word in the block has at least one bit set; otherwise we would never
5656
// use some of the words
57-
constexpr cuda::int32_t min_pattern_bits = words_per_block;
57+
constexpr uint32_t min_pattern_bits = words_per_block;
5858

5959
// The maximum number of bits to be set for a key is capped by the total number of bits in
6060
// the filter block
61-
constexpr cuda::int32_t max_pattern_bits = word_bits * words_per_block;
61+
constexpr uint32_t max_pattern_bits = word_bits * words_per_block;
6262

63-
constexpr cuda::int32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
64-
constexpr cuda::int32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
63+
constexpr uint32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
64+
constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
6565
CUCO_EXPECTS(
6666
pattern_bits <= max_pattern_bits_from_hash,
6767
"`hash_result_type` too narrow to generate the requested number of `pattern_bits`");
@@ -85,8 +85,7 @@ class default_filter_policy_impl {
8585
return hash % num_blocks;
8686
}
8787

88-
__device__ constexpr word_type word_pattern(hash_result_type hash,
89-
cuda::std::int32_t word_index) const
88+
__device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
9089
{
9190
word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1;
9291

@@ -95,11 +94,10 @@ class default_filter_policy_impl {
9594

9695
hash >>= bits_so_far * bit_index_width;
9796

98-
word_type word = 0;
99-
cuda::std::int32_t const bits_per_word =
100-
min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
97+
word_type word = 0;
98+
int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
10199

102-
for (cuda::std::int32_t bit = 0; bit < bits_per_word; ++bit) {
100+
for (int32_t bit = 0; bit < bits_per_word; ++bit) {
103101
word |= word_type{1} << (hash & bit_index_mask);
104102
hash >>= bit_index_width;
105103
}
@@ -108,10 +106,10 @@ class default_filter_policy_impl {
108106
}
109107

110108
private:
111-
cuda::std::int32_t pattern_bits_;
112-
cuda::std::int32_t min_bits_per_word_;
113-
cuda::std::int32_t remainder_bits_;
109+
uint32_t pattern_bits_;
110+
uint32_t min_bits_per_word_;
111+
uint32_t remainder_bits_;
114112
hasher hash_;
115113
};
116114

117-
} // namespace cuco::detail
115+
} // namespace cuco::detail

include/cuco/detail/bloom_filter/kernels.cuh

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,13 @@
2121

2222
#include <cooperative_groups.h>
2323

24+
#include <cstdint>
25+
2426
namespace cuco::detail::bloom_filter_ns {
2527

2628
CUCO_SUPPRESS_KERNEL_WARNINGS
2729

28-
template <::int BlockSize, class InputIt, class Ref>
30+
template <int32_t BlockSize, class InputIt, class Ref>
2931
CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
3032
cuco::detail::index_type n,
3133
Ref ref)
@@ -47,7 +49,12 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
4749
ref.add(tile, first + tile_start, first + tile_stop);
4850
}
4951

50-
template <::int CGSize, ::int BlockSize, class InputIt, class StencilIt, class Predicate, class Ref>
52+
template <int32_t CGSize,
53+
int32_t BlockSize,
54+
class InputIt,
55+
class StencilIt,
56+
class Predicate,
57+
class Ref>
5158
CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
5259
InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
5360
{
@@ -68,8 +75,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
6875
}
6976
}
7077

71-
template <::int CGSize,
72-
::int BlockSize,
78+
template <int32_t CGSize,
79+
int32_t BlockSize,
7380
class InputIt,
7481
class StencilIt,
7582
class Predicate,

include/cuco/detail/dynamic_map.inl

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,8 @@ dynamic_map<Key, Value, Scope, Allocator>::dynamic_map(std::size_t initial_capac
7979
template <typename Key, typename Value, cuda::thread_scope Scope, typename Allocator>
8080
void dynamic_map<Key, Value, Scope, Allocator>::reserve(std::size_t n, cudaStream_t stream)
8181
{
82-
std::int64_t num_elements_remaining = n;
83-
std::int32_t submap_idx = 0;
82+
std::size_t num_elements_remaining = n;
83+
std::size_t submap_idx = 0;
8484
while (num_elements_remaining > 0) {
8585
std::size_t submap_capacity;
8686

@@ -135,7 +135,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::insert(
135135

136136
reserve(size_ + num_to_insert, stream);
137137

138-
std::int32_t submap_idx = 0;
138+
std::size_t submap_idx = 0;
139139
while (num_to_insert > 0) {
140140
std::size_t capacity_remaining =
141141
max_load_factor_ * submaps_[submap_idx]->get_capacity() - submaps_[submap_idx]->get_size();
@@ -194,7 +194,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
194194
auto const grid_size = (tile_size * num_keys + stride * block_size - 1) / (stride * block_size);
195195

196196
// zero out submap success counters
197-
for (std::int32_t i = 0; i < submaps_.size(); ++i) {
197+
for (std::size_t i = 0; i < submaps_.size(); ++i) {
198198
CUCO_CUDA_TRY(cudaMemsetAsync(submap_num_successes_[i], 0, sizeof(atomic_ctr_type), stream));
199199
}
200200

@@ -209,7 +209,7 @@ void dynamic_map<Key, Value, Scope, Allocator>::erase(
209209
hash,
210210
key_equal);
211211

212-
for (std::int32_t i = 0; i < submaps_.size(); ++i) {
212+
for (std::size_t i = 0; i < submaps_.size(); ++i) {
213213
std::size_t* h_submap_num_successes;
214214
CUCO_CUDA_TRY(cudaMallocHost(&h_submap_num_successes, sizeof(std::size_t)));
215215
CUCO_CUDA_TRY(cudaMemcpyAsync(h_submap_num_successes,

0 commit comments

Comments
 (0)