NVIDIA
diff --git a/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh b/include/cuco/detail/bloom_filter/arrow_filter_policy.cuh
Lines changed: 15 additions & 12 deletions
diff --git a/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh b/include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Lines changed: 13 additions & 12 deletions
diff --git a/include/cuco/detail/bloom_filter/default_filter_policy.inl b/include/cuco/detail/bloom_filter/default_filter_policy.inl
Lines changed: 9 additions & 9 deletions
diff --git a/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh b/include/cuco/detail/bloom_filter/default_filter_policy_impl.cuh
Lines changed: 20 additions & 18 deletions
diff --git a/include/cuco/detail/bloom_filter/kernels.cuh b/include/cuco/detail/bloom_filter/kernels.cuh
Lines changed: 4 additions & 11 deletions
@@ -36,7 +36,7 @@ namespace cuco::detail {
  *
  * Example:
  * @code{.cpp}
- * template <typename KeyType, std::uint32_t NUM_FILTER_BLOCKS>
+ * template <typename KeyType, cuda::std::int32_t NUM_FILTER_BLOCKS>
  * void bulk_insert_and_eval_arrow_policy_bloom_filter(device_vector<KeyType> const& positive_keys,
  *                                                 device_vector<KeyType> const& negative_keys)
  * {
@@ -84,18 +84,20 @@ template <class Key, template <typename> class XXHash64>
 class arrow_filter_policy {
  public:
   using hasher           = XXHash64<Key>;  ///< 64-bit XXHash hasher for Arrow bloom filter policy
-  using word_type        = std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
-  using key_type         = Key;            ///< Hash function input type
-  using hash_result_type = std::uint64_t;  ///< hash function output type
+  using word_type        = cuda::std::uint32_t;  ///< uint32_t for Arrow bloom filter policy
+  using key_type         = Key;                  ///< Hash function input type
+  using hash_result_type = cuda::std::uint64_t;  ///< hash function output type
 
-  static constexpr uint32_t bits_set_per_block = 8;  ///< hardcoded bits set per Arrow filter block
-  static constexpr uint32_t words_per_block    = 8;  ///< hardcoded words per Arrow filter block
+  static constexpr cuda::std::int32_t bits_set_per_block =
+    8;  ///< hardcoded bits set per Arrow filter block
+  static constexpr cuda::std::int32_t words_per_block =
+    8;  ///< hardcoded words per Arrow filter block
 
-  static constexpr std::uint32_t bytes_per_filter_block =
+  static constexpr cuda::std::int32_t bytes_per_filter_block =
     32;  ///< Number of bytes in one Arrow filter block
-  static constexpr std::uint32_t max_arrow_filter_bytes =
+  static constexpr cuda::std::int32_t max_arrow_filter_bytes =
     128 * 1024 * 1024;  ///< Max bytes in Arrow bloom filter
-  static constexpr std::uint32_t max_filter_blocks =
+  static constexpr cuda::std::int32_t max_filter_blocks =
     (max_arrow_filter_bytes /
      bytes_per_filter_block);  ///< Max sub-filter blocks allowed in Arrow bloom filter
 
@@ -153,10 +155,11 @@ class arrow_filter_policy {
    *
    * @return The bit pattern for the word/segment in the filter block
    */
-  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
+  __device__ constexpr word_type word_pattern(hash_result_type hash,
+                                              cuda::std::int32_t word_index) const
   {
     word_type const key = static_cast<word_type>(hash);
-    std::uint32_t salt;
+    cuda::std::int32_t salt;
 
     // Basically a switch (word_index) { case 0-7 ... }
     // First split: 0..3 versus 4..7.
@@ -186,4 +189,4 @@ class arrow_filter_policy {
   hasher hash_;
 };
 
-}  // namespace cuco::detail
+}  // namespace cuco::detail
@@ -29,6 +29,7 @@
 #include <cuda/std/__algorithm/min.h>  // TODO #include <cuda/std/algorithm> once available
 #include <cuda/std/array>
 #include <cuda/std/bit>
+#include <cuda/std/cstdint>
 #include <cuda/std/functional>
 #include <cuda/std/tuple>
 #include <cuda/std/type_traits>
@@ -37,8 +38,6 @@
 
 #include <cooperative_groups.h>
 
-#include <cstdint>
-
 namespace cuco::detail {
 
 template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
@@ -138,7 +137,7 @@ class bloom_filter_impl {
   __device__ void add_impl(HashValue const& hash_value, BlockIndex block_index)
   {
 #pragma unroll words_per_block
-    for (uint32_t i = 0; i < words_per_block; ++i) {
+    for (cuda::std::int32_t i = 0; i < words_per_block; ++i) {
       auto const word = policy_.word_pattern(hash_value, i);
       if (word != 0) {
         auto atom_word = cuda::atomic_ref<word_type, thread_scope>{
@@ -200,7 +199,7 @@ class bloom_filter_impl {
           block_index = policy_.block_index(hash_value, num_blocks_);
         }
 
-        for (uint32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) {
+        for (cuda::std::int32_t j = 0; (j < num_threads) and (i + j < num_keys); ++j) {
           this->add_impl(group, group.shfl(hash_value, j), group.shfl(block_index, j));
         }
       }
@@ -220,7 +219,9 @@ class bloom_filter_impl {
           block_index = policy_.block_index(hash_value, num_blocks_);
         }
 
-        for (uint32_t j = 0; (j < worker_num_threads) and (i + worker_offset + j < num_keys); ++j) {
+        for (cuda::std::int32_t j = 0;
+             (j < worker_num_threads) and (i + worker_offset + j < num_keys);
+             ++j) {
           this->add_impl(
             worker_group, worker_group.shfl(hash_value, j), worker_group.shfl(block_index, j));
         }
@@ -318,7 +319,7 @@ class bloom_filter_impl {
       policy_.block_index(hash_value, num_blocks_) * words_per_block);
 
 #pragma unroll words_per_block
-    for (uint32_t i = 0; i < words_per_block; ++i) {
+    for (cuda::std::int32_t i = 0; i < words_per_block; ++i) {
       auto const expected_pattern = policy_.word_pattern(hash_value, i);
       if ((stored_pattern[i] & expected_pattern) != expected_pattern) { return false; }
     }
@@ -342,12 +343,12 @@ class bloom_filter_impl {
       bool success          = true;
 
 #pragma unroll
-      for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
+      for (cuda::std::int32_t i = rank; i < optimal_num_threads; i += num_threads) {
         auto const thread_offset  = i * words_per_thread;
         auto const stored_pattern = this->vec_load_words<words_per_thread>(
           policy_.block_index(hash_value, num_blocks_) * words_per_block + thread_offset);
 #pragma unroll words_per_thread
-        for (uint32_t j = 0; j < words_per_thread; ++j) {
+        for (cuda::std::int32_t j = 0; j < words_per_thread; ++j) {
           auto const expected_pattern = policy_.word_pattern(hash_value, thread_offset + j);
           if ((stored_pattern[j] & expected_pattern) != expected_pattern) { success = false; }
         }
@@ -430,25 +431,25 @@ class bloom_filter_impl {
   // TODO
   // [[nodiscard]] __host__ double occupancy() const;
   // [[nodiscard]] __host__ double expected_false_positive_rate(size_t unique_keys) const
-  // [[nodiscard]] __host__ __device__ static uint32_t optimal_pattern_bits(size_t num_blocks)
+  // [[nodiscard]] __host__ __device__ static int32_t optimal_pattern_bits(size_t num_blocks)
   // template <typename CG, cuda::thread_scope NewScope = thread_scope>
   // [[nodiscard]] __device__ constexpr auto make_copy(CG const& group, word_type* const
   // memory_to_use, cuda_thread_scope<NewScope> scope = {}) const noexcept;
 
  private:
-  template <uint32_t NumWords>
+  template <cuda::std::int32_t NumWords>
   __device__ constexpr cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
   {
     return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
       words_ + index, cuda::std::min(sizeof(word_type) * NumWords, max_vec_bytes())));
   }
 
-  [[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
+  [[nodiscard]] __host__ __device__ static constexpr cuda::std::int32_t add_optimal_cg_size()
   {
     return words_per_block;  // one thread per word so atomic updates can be coalesced
   }
 
-  [[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size()
+  [[nodiscard]] __host__ __device__ static constexpr cuda::std::int32_t contains_optimal_cg_size()
   {
     constexpr auto word_bytes  = sizeof(word_type);
     constexpr auto block_bytes = word_bytes * words_per_block;
 
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,27 +16,27 @@
 
 #pragma once
 
-#include <cstdint>
+#include <cuda/std/cstdint>
 
 namespace cuco {
 
-template <class Hash, class Word, uint32_t WordsPerBlock>
+template <class Hash, class Word, cuda::std::int32_t WordsPerBlock>
 __host__
   __device__ constexpr default_filter_policy<Hash, Word, WordsPerBlock>::default_filter_policy(
-    uint32_t pattern_bits, Hash hash)
+    cuda::std::int32_t pattern_bits, Hash hash)
   : impl_{pattern_bits, hash}
 {
 }
 
-template <class Hash, class Word, uint32_t WordsPerBlock>
+template <class Hash, class Word, cuda::std::int32_t WordsPerBlock>
 __device__ constexpr typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type
 default_filter_policy<Hash, Word, WordsPerBlock>::hash(
   typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_argument_type const& key) const
 {
   return impl_.hash(key);
 }
 
-template <class Hash, class Word, uint32_t WordsPerBlock>
+template <class Hash, class Word, cuda::std::int32_t WordsPerBlock>
 template <class Extent>
 __device__ constexpr auto default_filter_policy<Hash, Word, WordsPerBlock>::block_index(
   typename default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type hash,
@@ -45,13 +45,13 @@ __device__ constexpr auto default_filter_policy<Hash, Word, WordsPerBlock>::bloc
   return impl_.block_index(hash, num_blocks);
 }
 
-template <class Hash, class Word, uint32_t WordsPerBlock>
+template <class Hash, class Word, cuda::std::int32_t WordsPerBlock>
 __device__ constexpr typename default_filter_policy<Hash, Word, WordsPerBlock>::word_type
 default_filter_policy<Hash, Word, WordsPerBlock>::word_pattern(
   default_filter_policy<Hash, Word, WordsPerBlock>::hash_result_type hash,
-  std::uint32_t word_index) const
+  cuda::std::int32_t word_index) const
 {
   return impl_.word_pattern(hash, word_index);
 }
 
-}  // namespace cuco
+}  // namespace cuco
@@ -19,31 +19,31 @@
 #include <cuco/detail/error.hpp>
 
 #include <cuda/std/bit>
+#include <cuda/std/cstdint>
 #include <cuda/std/limits>
 #include <cuda/std/tuple>
 #include <cuda/std/type_traits>
 
-#include <cstdint>
 #include <nv/target>
 
 namespace cuco::detail {
 
-template <class Hash, class Word, uint32_t WordsPerBlock>
+template <class Hash, class Word, cuda::std::int32_t WordsPerBlock>
 class default_filter_policy_impl {
  public:
   using hasher             = Hash;
   using word_type          = Word;
   using hash_argument_type = typename hasher::argument_type;
   using hash_result_type   = decltype(std::declval<hasher>()(std::declval<hash_argument_type>()));
 
-  static constexpr std::uint32_t words_per_block = WordsPerBlock;
+  static constexpr cuda::std::int32_t words_per_block = WordsPerBlock;
 
  private:
-  static constexpr std::uint32_t word_bits       = cuda::std::numeric_limits<word_type>::digits;
-  static constexpr std::uint32_t bit_index_width = cuda::std::bit_width(word_bits - 1);
+  static constexpr cuda::std::int32_t word_bits = cuda::std::numeric_limits<word_type>::digits;
+  static constexpr cuda::std::int32_t bit_index_width = cuda::std::bit_width(word_bits - 1);
 
  public:
-  __host__ __device__ explicit constexpr default_filter_policy_impl(uint32_t pattern_bits,
+  __host__ __device__ explicit constexpr default_filter_policy_impl(cuda::std::int32_t pattern_bits,
                                                                     Hash hash)
     : pattern_bits_{pattern_bits},
       min_bits_per_word_{pattern_bits_ / words_per_block},
@@ -54,14 +54,14 @@ class default_filter_policy_impl {
       NV_IS_HOST,
       (  // This ensures each word in the block has at least one bit set; otherwise we would never
          // use some of the words
-        constexpr uint32_t min_pattern_bits = words_per_block;
+        constexpr cuda::std::int32_t min_pattern_bits = words_per_block;
 
         // The maximum number of bits to be set for a key is capped by the total number of bits in
         // the filter block
-        constexpr uint32_t max_pattern_bits = word_bits * words_per_block;
+        constexpr cuda::std::int32_t max_pattern_bits = word_bits * words_per_block;
 
-        constexpr uint32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
-        constexpr uint32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
+        constexpr cuda::std::int32_t hash_bits = cuda::std::numeric_limits<hash_result_type>::digits;
+        constexpr cuda::std::int32_t max_pattern_bits_from_hash = hash_bits / bit_index_width;
         CUCO_EXPECTS(
           pattern_bits <= max_pattern_bits_from_hash,
           "`hash_result_type` too narrow to generate the requested number of `pattern_bits`");
@@ -85,7 +85,8 @@ class default_filter_policy_impl {
     return hash % num_blocks;
   }
 
-  __device__ constexpr word_type word_pattern(hash_result_type hash, std::uint32_t word_index) const
+  __device__ constexpr word_type word_pattern(hash_result_type hash,
+                                              cuda::std::int32_t word_index) const
   {
     word_type constexpr bit_index_mask = (word_type{1} << bit_index_width) - 1;
 
@@ -94,10 +95,11 @@ class default_filter_policy_impl {
 
     hash >>= bits_so_far * bit_index_width;
 
-    word_type word              = 0;
-    int32_t const bits_per_word = min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
+    word_type word = 0;
+    cuda::std::int32_t const bits_per_word =
+      min_bits_per_word_ + (word_index < remainder_bits_ ? 1 : 0);
 
-    for (int32_t bit = 0; bit < bits_per_word; ++bit) {
+    for (cuda::std::int32_t bit = 0; bit < bits_per_word; ++bit) {
       word |= word_type{1} << (hash & bit_index_mask);
       hash >>= bit_index_width;
     }
@@ -106,10 +108,10 @@ class default_filter_policy_impl {
   }
 
  private:
-  uint32_t pattern_bits_;
-  uint32_t min_bits_per_word_;
-  uint32_t remainder_bits_;
+  cuda::std::int32_t pattern_bits_;
+  cuda::std::int32_t min_bits_per_word_;
+  cuda::std::int32_t remainder_bits_;
   hasher hash_;
 };
 
-}  // namespace cuco::detail
+}  // namespace cuco::detail
@@ -21,13 +21,11 @@
 
 #include <cooperative_groups.h>
 
-#include <cstdint>
-
 namespace cuco::detail::bloom_filter_ns {
 
 CUCO_SUPPRESS_KERNEL_WARNINGS
 
-template <int32_t BlockSize, class InputIt, class Ref>
+template <cuda::std::int32_t BlockSize, class InputIt, class Ref>
 CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
                                                   cuco::detail::index_type n,
                                                   Ref ref)
@@ -49,12 +47,7 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add(InputIt first,
   ref.add(tile, first + tile_start, first + tile_stop);
 }
 
-template <int32_t CGSize,
-          int32_t BlockSize,
-          class InputIt,
-          class StencilIt,
-          class Predicate,
-          class Ref>
+template <cuda::std::int32_t CGSize, cuda::std::int32_t BlockSize, class InputIt, class StencilIt, class Predicate, class Ref>
 CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
   InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
 {
@@ -75,8 +68,8 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
   }
 }
 
-template <int32_t CGSize,
-          int32_t BlockSize,
+template <cuda::std::int32_t CGSize,
+          cuda::std::int32_t BlockSize,
           class InputIt,
           class StencilIt,
           class Predicate,