Skip to content

Commit 6816740

Browse files
authored
Implement OA retrieve(_outer) and its multiset API (#537)
1 parent dafcf45 commit 6816740

File tree

20 files changed

+1189
-16
lines changed

20 files changed

+1189
-16
lines changed

README.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,13 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
227227
#### Examples:
228228
- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJylVgtv2zYQ_isHDUXtVJYfaFDEjQN4bYoZK5whTlsUcaHQFG0TkUmNpOx6hv_77ijJlpsM67AWiCHe-7vvjtwFVlgrtbJB_34XyCTod8MgZWqRs4UI-gHPExaEgdW54fTdPpsqOIN3OtsauVg6aPAm9Dq9bgv_vA5h_Hn0fjSEdze3f9zcDu9GN-OIDLzRR8mFsiKBXCXCgFsKGGaM408pCeGzMJQN9KIONEhhGpSyadB8671sdQ4rtgWlHeRWoBtpYS5TAeI7F5kDqYDrVZZKpriAjXRLH6r049OBr6UTPXMM9RlaZPg1r2sCc4fU6d_Suazfbm82m4j5tCNtFu20ULbtj6N31-PJdQtTP5h9UinCC0b8mUuDhc-2wDLMjLMZ5puyDWgDbGEEypymzDdGOqkWIVg9dxtmhPeTSOuMnOXuBLwqT6y_roDwMYXADScwmkwD-HU4GU1C7-fL6O63m0938GV4ezsc342uJ3Bzi80avx9Rq_DrAwzHX-H30fh9CAKhw1Die2aoCkxVEqwiKTCcCHGSxlwXadlMcDmXHCoawUKvhVFYFmTCrGRBOEwy8X5SuZKOOX_2pDgfqj1VU_WLVDzNEwGXPOe6bcmEx6s8dXLFsojny6tTNbc0uXXtRKzRVbwW3GkTkdITFemEYShtc50rgj-uTp7Xt9hPgeR6XuoMUxbBWEU_ZuQrtf5QKocElKqx1jJpTtUO60Q6E0aPYhu7bSaQcgOkhHt7FK1ZmotCWIlIeLAQq8xtY_q0ggoRqWfuAFpd76VmX6gWBwflUpFUObbDUevBuqTft_IvNIQxqpx3XnY6nVKt3cZdgJom5w5bClU_irnrdjoh6oJNNUqLGlpd6rz_sb7TPhOqoe2zKb1WOdkIxtoVTOM0dxKVkfF8qVEFHpXekNcNTXqaIihWGIc5-rjo1IalQ6InzkWqWQJzRmSgeT_vvIh8ucgprPKUVJcVsGENuSvsW-bbBQjHGfTC0vgA_u5pG_anSt7b7rkW7PclsAWZ-v0T-l4WTjImzfO5XQHJbGPcrPXHCIYAMqhoS3V7tQiuBS4yggmW1I2NxuIcrjYblcbX0SIKYbdDPLGGXTfs4k8URYAnvXMiQnEM_qMU7usFHMahUWBWna_Yo4ifDNwlsvqq0WmGP69clEIEbTQru-JsJhY4YdXZ_TeI4xLOuMFyWrhN2OF2drlR8G_Q7iS8gMYY2tBrhiD3b2FfA3nkeYcTgBz0wWk8tWct0oWUaEkV7GycZlcmK1SC-R89TurdMnomPJsJdyC4iy68vnh5cXGx_2fCVKVceevY6XguMZDnx9GmYkajrnRM8OTU5xlCp84wakwxzJrz3BjyZSlxnzJea_eYdbE3mrRZ6NLFmwoFHqBqh5fu7vDsIdZ4mZkHsPl8Lr-jaoK3phMUhbkfQlEkhg8C1fLspfXQ9c58k_0mA3SX5S4mmuAO8zcGZV2E-em6m_9zOpMYr1Is1zZq-dSQ_ICRChI9bfbe702LgRAFZq3mktF9f9icdeI9HCI9VAwljGjSH-pl3ctvD5BoYdVLhzc9PiPCmi1KIysQwAQGg2evjIcDzGVBgnQ9vjhXRoq1-I8Qh0eQKrWCrD7KwWnZyFrU1lPDI7DEKacdS0Hlq5nwu9_nVS08sEudp_hEQ-6N4RX4QS-qs35o8b3ZOCHR4MdkBnAwPBClXC50XeKU4kuanqb4WDXHB3eg1px3e-d5F8U6c8VrPGjhvTvgr15130CLGb4c2FX8pgOtFl7KDv84rFkkrZStZv6JnspZzSfnPMXDdfGexgO8odVjsA8rOVL1RI5MDvbf_P-_AYKkJA4=))
229229

230+
### `static_multiset`
231+
232+
`cuco::static_multiset` is a fixed-size container that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multiset.cuh` for more detailed information.
233+
234+
#### Examples:
235+
- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multiset/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJyVVw1vGkcQ_SuTqypDcnxZjSIRuyq1HRU1wpFxEkUhwsveACvf3dLdPQi1_N87s3sHhz_a1JZsuJ19--bNzFu4iyxaq3Ruo_7Xu0glUb8XR6nIF4VYYNSPZJGIKI6sLozk952XkxxewplebY1aLB00ZBOOu8e_xDD6NDwfDuDs8urD5dXgeng5anOsj3-vJOYWEyjyBA24JcJgJST9K1di-ISGicBxuwsNDphE5dokar71KFtdQCa2kGsHhUWCURbmKkXA7xJXDlQOUmerVIlcImyUW_qjShxPB76UIHrmBMUL2rGid_N6JAi3o84_S-dW_U5ns9m0hafd1mbRSUOw7bwfnl2Mxhctor7b9jFPSVkw-FehDCU-24JYETMpZsQ3FRvQBsTCIK05zcw3RjmVL2Kweu42wqDHSZR1Rs0KdyBexZPyrweQfCIn4QZjGI4nEfw-GA_Hscf5PLz-4_LjNXweXF0NRtfDizFcXlGxRudDLhW9eweD0Rf4czg6jwFJOjoKv68MZ0FUFcuKSdBwjHhAY64DLbtCqeZKQtVBsNBrNDmlBSs0mQq9RiQTj5OqTDnh_LNHyfmjOpN8kv-kcpkWCcKJLKTuWN4ip1mROmXRtWWx_PUwzC1NYV0nwTVBTdconTZtDnoUMi9yyeeL9On1VC-oZM8sWiouUqe1H56vNNUERXawySdrfWTnZWiS33zzLil6OivS2yl-FyQzUkZheWYUzuEcMxLIGeGQZLIsa9muD5SgyjMMFZ4hYfBhaPcjeE3xvOofA7UXdUu6pcZb69CTc6Mzj-o3U4l8UMFTy_VNNFdRUxnLgqXqFuFGUaWMu6EW8afcGKRuxDXe-GaEHTVifIvbPR8qrModTaLKG2utkuYkv6PndB4fQ6FTt10hnBJB95Y1A-h04CJbuS3YVLuQgkFuUMxdmDB-bdb0ehLxQ5VjSmKsRVqgbbMC1KvhHdilLlLaRCJiSg1Cm2whl-U5bikca0FeQ7kY0FIWhoeULIj_rwoHiXCC09mTlVwmnhlA5jnlhYoHZdLq7RMZFdmMcYMq7ADEJGiJPBtU2qTft-pvnLoabl5kU7_hFF53j7rd7h7xjIyPTAAkO5QilWbClpZAsT9DqgUVUvAo8A5R0Jl7YF6dhlXC7rZfv32SxB79NCxKVGljx6pTx2nWuXH_FpKrBtwN3ppJ4hQFgd5UqDdlaUMTtHogrO9IL2dQqtLTK892QAQPp-Ckqsevu-67qw6Iyy27-tw9rtT9fY04TTFpyqTDqO8qdteNoRfDcQztdgzqvtzwGY8Mux7z97bOFQWkO4O3gduQIZHBw6Ycv5z9_6gm4PGRd3RyDbcbGAh-0-8fGFotTw6sV-HYX5i7bRX5hgec4YJmrhkHeMwTft2tVWvoSVOp0jTkSiOogzMIu-T6ceDOfkOOz2IHKjvY4FsiQ6i5Al2E5AN88wWp6gpUkxpGMAhIg8gwFYX_S2fXlUXuQoN5cMMaWeaUCSeXpQ9Z7nz-qECFenBu-5kZYVxMprpwNJFTXqNx2V9XvPwj_MZUZDZ7dl9yNtr_I71Qnroyeoa28QSXw974DxivxPM4JdOr0vR9yxyId2g18PWA3ZQyjmH6DU45rFbD6hL5l4Y9ANqHHNKunjefcbPqnIe1esQSWk8fWEqg5tBoPAN2-lQ_NP39-vyWapabTbgL2gS3pTA4OaHrbVxI6lX7At4ReLIb1faECEU-23vAlK67p7e_Eyp9AWOdofOl2pD30QdQnS_qCCE5IlmYHPiquaevAvwBmz61mP03hihfS9k7fl30aFmvXPg6EbXozFP56lXvDbSEkctTm03fdKHVogvH0R9HumLSSkU2898xUjWrYUopU3q4Dt8K6AHdH_ltdB9X6-TjB-vUyNH9N__7D2OnfWU=))
236+
230237
### `dynamic_map`
231238

232239
`cuco::dynamic_map` links together multiple `cuco::static_map`s to provide a hash table that can grow as key-value pairs are inserted. It currently only provides host-bulk APIs. See the Doxygen documentation in `dynamic_map.cuh` for more detailed information.

benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,7 @@ ConfigureBench(STATIC_MAP_BENCH
6969
# - static_multiset benchmarks --------------------------------------------------------------------
7070
ConfigureBench(STATIC_MULTISET_BENCH
7171
static_multiset/contains_bench.cu
72+
static_multiset/retrieve_bench.cu
7273
static_multiset/count_bench.cu
7374
static_multiset/find_bench.cu
7475
static_multiset/insert_bench.cu)
Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <benchmark_defaults.hpp>
18+
#include <benchmark_utils.hpp>
19+
20+
#include <cuco/static_multiset.cuh>
21+
#include <cuco/utility/key_generator.cuh>
22+
23+
#include <nvbench/nvbench.cuh>
24+
25+
#include <thrust/device_vector.h>
26+
#include <thrust/transform.h>
27+
28+
using namespace cuco::benchmark;
29+
using namespace cuco::utility;
30+
31+
/**
32+
* @brief A benchmark evaluating `cuco::static_multiset::retrieve` performance
33+
*/
34+
template <typename Key, typename Dist>
35+
void static_multiset_retrieve(nvbench::state& state, nvbench::type_list<Key, Dist>)
36+
{
37+
auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N);
38+
auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY);
39+
auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE);
40+
41+
std::size_t const size = num_keys / occupancy;
42+
43+
thrust::device_vector<Key> keys(num_keys);
44+
45+
key_generator gen;
46+
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());
47+
48+
gen.dropout(keys.begin(), keys.end(), matching_rate);
49+
50+
state.add_element_count(num_keys);
51+
52+
cuco::static_multiset<Key> set{size, cuco::empty_key<Key>{-1}};
53+
set.insert(keys.begin(), keys.end());
54+
55+
auto const output_size = set.count(keys.begin(), keys.end());
56+
thrust::device_vector<Key> output_match(output_size);
57+
auto output_probe_begin = thrust::discard_iterator{};
58+
59+
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
60+
set.retrieve(
61+
keys.begin(), keys.end(), output_probe_begin, output_match.begin(), {launch.get_stream()});
62+
});
63+
}
64+
65+
NVBENCH_BENCH_TYPES(static_multiset_retrieve,
66+
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
67+
nvbench::type_list<distribution::uniform>))
68+
.set_name("static_multiset_retrieve_uniform_occupancy")
69+
.set_type_axes_names({"Key", "Distribution"})
70+
.set_max_noise(defaults::MAX_NOISE)
71+
.add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE);
72+
73+
NVBENCH_BENCH_TYPES(static_multiset_retrieve,
74+
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
75+
nvbench::type_list<distribution::uniform>))
76+
.set_name("static_multiset_retrieve_uniform_matching_rate")
77+
.set_type_axes_names({"Key", "Distribution"})
78+
.set_max_noise(defaults::MAX_NOISE)
79+
.add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE);
80+
81+
NVBENCH_BENCH_TYPES(static_multiset_retrieve,
82+
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
83+
nvbench::type_list<distribution::uniform>))
84+
.set_name("static_multiset_retrieve_uniform_multiplicity")
85+
.set_type_axes_names({"Key", "Distribution"})
86+
.set_max_noise(defaults::MAX_NOISE)
87+
.add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE);

examples/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/stat
3838
ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu")
3939
ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu")
4040
ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu")
41+
ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu")
4142
ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu")
4243
ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu")
4344
ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
/*
2+
* Copyright (c) 2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <cuco/static_multiset.cuh>
18+
19+
#include <thrust/device_vector.h>
20+
#include <thrust/functional.h>
21+
#include <thrust/logical.h>
22+
#include <thrust/sequence.h>
23+
24+
#include <iostream>
25+
#include <limits>
26+
27+
/**
28+
* @file host_bulk_example.cu
29+
* @brief Demonstrates usage of the static_multiset "bulk" host APIs.
30+
*
31+
* The bulk APIs are only invocable from the host and are used for doing operations like `insert` or
32+
* `retrieve` on a multiset of keys.
33+
*
34+
*/
35+
int main(void)
36+
{
37+
using key_type = int;
38+
39+
// Empty slots are represented by reserved "sentinel" values. These values should be selected such
40+
// that they never occur in your input data.
41+
key_type constexpr empty_key_sentinel = -1;
42+
43+
// Number of keys to be inserted
44+
std::size_t constexpr num_keys = 50'000;
45+
46+
// Compute capacity based on a 50% load factor
47+
auto constexpr load_factor = 0.5;
48+
std::size_t const capacity = std::ceil(num_keys / load_factor);
49+
50+
// Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel.
51+
cuco::static_multiset<key_type> multiset{capacity, cuco::empty_key{empty_key_sentinel}};
52+
53+
// Create a sequence of keys {0, 1, 2, .., i}
54+
// We're going to insert each key twice so we only need 'num_keys / 2' distinct keys.
55+
thrust::device_vector<key_type> keys(num_keys / 2);
56+
thrust::sequence(keys.begin(), keys.end(), 0);
57+
58+
// Inserts all keys into the hash set
59+
multiset.insert(keys.begin(), keys.end());
60+
// Insert the same set of keys again, so each distinct key should occur twice in the multiset
61+
multiset.insert(keys.begin(), keys.end());
62+
63+
// Counts the occurrences of matching keys contained in the multiset.
64+
std::size_t const counted_output_size = multiset.count(keys.begin(), keys.end());
65+
66+
// Storage for result
67+
thrust::device_vector<key_type> output_probes(counted_output_size);
68+
thrust::device_vector<key_type> output_matches(counted_output_size);
69+
70+
// Retrieve all matching keys
71+
auto const [output_probes_end, _] =
72+
multiset.retrieve(keys.begin(), keys.end(), output_probes.begin(), output_matches.begin());
73+
std::size_t const retrieved_output_size = output_probes_end - output_probes.begin();
74+
75+
if ((retrieved_output_size == counted_output_size) and (retrieved_output_size == num_keys)) {
76+
std::cout << "Success! Found all keys.\n";
77+
} else {
78+
std::cout << "Fail! Something went wrong.\n";
79+
}
80+
81+
return 0;
82+
}

include/cuco/detail/extent/extent.inl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818

1919
#include <cuco/detail/error.hpp>
2020
#include <cuco/detail/prime.hpp> // TODO move to detail/extent/
21-
#include <cuco/detail/utility/math.hpp>
21+
#include <cuco/detail/utility/math.cuh>
2222
#include <cuco/detail/utils.hpp>
2323
#include <cuco/utility/fast_int.cuh>
2424

include/cuco/detail/open_addressing/kernels.cuh

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -399,6 +399,77 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find(InputIt first,
399399
}
400400
}
401401

402+
/**
403+
* @brief Retrieves the equivalent container elements of all keys in the range `[input_probe,
404+
* input_probe + n)`.
405+
*
406+
* If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to
407+
* `output_probe` and associated slot contents to `output_match`, respectively. The output order is
408+
* unspecified.
409+
*
410+
* @tparam IsOuter Flag indicating whether it's an outer count or not
411+
* @tparam block_size The size of the thread block
412+
* @tparam InputProbeIt Device accessible input iterator
413+
* @tparam OutputProbeIt Device accessible input iterator whose `value_type` is
414+
* convertible to the `InputProbeIt`'s `value_type`
415+
* @tparam OutputMatchIt Device accessible input iterator whose `value_type` is
416+
* convertible to the container's `value_type`
417+
* @tparam AtomicCounter Integral atomic type that follows the same semantics as
418+
* `cuda::(std::)atomic(_ref)`
419+
* @tparam Ref Type of non-owning device ref allowing access to storage
420+
*
421+
* @param input_probe Beginning of the sequence of input keys
422+
* @param n Number of the keys to query
423+
* @param output_probe Beginning of the sequence of keys corresponding to matching elements in
424+
* `output_match`
425+
* @param output_match Beginning of the sequence of matching elements
426+
* @param atomic_counter Pointer to an atomic object of integral type that is used to count the
427+
* number of output elements
428+
* @param ref Non-owning container device ref used to access the slot storage
429+
*/
430+
template <bool IsOuter,
431+
int32_t BlockSize,
432+
class InputProbeIt,
433+
class OutputProbeIt,
434+
class OutputMatchIt,
435+
class AtomicCounter,
436+
class Ref>
437+
CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe,
438+
cuco::detail::index_type n,
439+
OutputProbeIt output_probe,
440+
OutputMatchIt output_match,
441+
AtomicCounter* atomic_counter,
442+
Ref ref)
443+
{
444+
namespace cg = cooperative_groups;
445+
446+
auto const block = cg::this_thread_block();
447+
auto constexpr tiles_in_block = BlockSize / Ref::cg_size;
448+
// make sure all but the last block are always occupied
449+
auto const items_per_block = detail::int_div_ceil(n, tiles_in_block * gridDim.x) * tiles_in_block;
450+
451+
auto const block_begin_offset = block.group_index().x * items_per_block;
452+
auto const block_end_offset = min(n, block_begin_offset + items_per_block);
453+
454+
if (block_begin_offset < block_end_offset) {
455+
if constexpr (IsOuter) {
456+
ref.retrieve_outer<BlockSize>(block,
457+
input_probe + block_begin_offset,
458+
input_probe + block_end_offset,
459+
output_probe,
460+
output_match,
461+
*atomic_counter);
462+
} else {
463+
ref.retrieve<BlockSize>(block,
464+
input_probe + block_begin_offset,
465+
input_probe + block_end_offset,
466+
output_probe,
467+
output_match,
468+
*atomic_counter);
469+
}
470+
}
471+
}
472+
402473
/**
403474
* @brief Inserts all elements in the range `[first, last)`.
404475
*

0 commit comments

Comments
 (0)