Skip to content

Commit 142ac06

Browse files
committed
v2
1 parent 77a4c1d commit 142ac06

File tree

7 files changed

+190
-154
lines changed

7 files changed

+190
-154
lines changed

benchmarks/roaring_bitmap/contains_bench.cu

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,8 @@ void roaring_bitmap_contains(nvbench::state& state)
5353
file.read(buffer, file_size);
5454
file.close();
5555

56-
cuda::std::span<cuda::std::byte const> bitmap(reinterpret_cast<cuda::std::byte const*>(buffer),
57-
file_size);
58-
cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(bitmap);
56+
cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(
57+
reinterpret_cast<cuda::std::byte const*>(buffer));
5958

6059
std::vector<cuda::std::uint32_t> keys;
6160
for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {

examples/roaring_bitmap/host_bulk_example.cu

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,9 +38,8 @@ int main(int argc, char* argv[])
3838
file.read(buffer, file_size);
3939
file.close();
4040

41-
cuda::std::span<cuda::std::byte const> bitmap(reinterpret_cast<cuda::std::byte const*>(buffer),
42-
file_size);
43-
cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(bitmap);
41+
cuco::roaring_bitmap<cuda::std::uint32_t> roaring_bitmap(
42+
reinterpret_cast<cuda::std::byte const*>(buffer));
4443

4544
std::vector<cuda::std::uint32_t> keys;
4645
for (cuda::std::uint32_t k = 0; k < 100000; k += 1000) {
@@ -58,13 +57,18 @@ int main(int argc, char* argv[])
5857

5958
roaring_bitmap.contains(keys_d.begin(), keys_d.end(), contained.begin());
6059

60+
size_t num_errors = 0;
6161
for (size_t i = 0; i < keys.size(); i++) {
6262
if (not contained[i]) {
63-
std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl;
63+
if (num_errors <= 10) {
64+
std::cout << "Error: " << keys_d[i] << " is not contained" << std::endl;
65+
}
66+
num_errors++;
6467
}
6568
}
69+
if (num_errors > 0) { std::cout << "num_errors: " << num_errors << std::endl; }
6670

67-
// check if all elements are contained
71+
// check if all elements are contained and written to output
6872
bool all_contained = thrust::all_of(contained.begin(), contained.end(), ::cuda::std::identity{});
6973
std::cout << "all_contained: " << all_contained << std::endl;
7074

include/cuco/detail/roaring_bitmap/roaring_bitmap.inl

Lines changed: 12 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -26,25 +26,20 @@
2626
namespace cuco {
2727

2828
template <class T, cuda::thread_scope Scope, class Allocator>
29-
__host__ roaring_bitmap<T, Scope, Allocator>::roaring_bitmap(
30-
cuda::std::span<cuda::std::byte const> compressed_bitmap,
31-
cuda_thread_scope<Scope> scope,
32-
Allocator const& alloc,
33-
cuda::stream_ref stream)
29+
__host__ roaring_bitmap<T, Scope, Allocator>::roaring_bitmap(cuda::std::byte const* bitmap,
30+
cuda_thread_scope<Scope> scope,
31+
Allocator const& alloc,
32+
cuda::stream_ref stream)
3433
: allocator_{alloc},
35-
data_{allocator_.allocate(compressed_bitmap.size()),
36-
detail::custom_deleter<cuda::std::size_t, allocator_type>{compressed_bitmap.size(),
37-
allocator_}},
38-
ref_{compressed_bitmap,
39-
cuda::std::span<cuda::std::byte const>(data_.get(), compressed_bitmap.size()),
40-
scope} // TODO move after memcpy?
34+
metadata_{ref_type<>::read_metadata(bitmap)},
35+
data_{
36+
allocator_.allocate(metadata_.size_bytes),
37+
detail::custom_deleter<cuda::std::size_t, allocator_type>{metadata_.size_bytes, allocator_}},
38+
ref_{data_.get(), metadata_, scope}
4139
{
42-
CUCO_CUDA_TRY(cudaMemcpyAsync(data_.get(),
43-
compressed_bitmap.data(),
44-
compressed_bitmap.size(),
45-
cudaMemcpyHostToDevice,
46-
stream.get()));
47-
stream.wait(); // TODO check if this is necessary
40+
CUCO_CUDA_TRY(cudaMemcpyAsync(
41+
data_.get(), bitmap, metadata_.size_bytes, cudaMemcpyHostToDevice, stream.get()));
42+
// stream.wait(); // TODO check if this is necessary
4843
}
4944

5045
template <class T, cuda::thread_scope Scope, class Allocator>

include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh

Lines changed: 141 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -30,8 +30,30 @@
3030
#include <thrust/fill.h>
3131
#include <thrust/transform.h>
3232

33+
#include <nv/target>
34+
3335
namespace cuco::detail {
3436

37+
template <class T>
38+
struct roaring_bitmap_metadata {
39+
static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
40+
};
41+
42+
template <>
43+
struct roaring_bitmap_metadata<cuda::std::uint32_t> {
44+
cuda::std::size_t size_bytes = 0;
45+
cuda::std::size_t num_keys = 0;
46+
cuda::std::size_t run_container_bitmap = 0;
47+
cuda::std::size_t key_cards = 0;
48+
cuda::std::size_t container_offsets = 0;
49+
cuda::std::int32_t num_containers = 0;
50+
bool has_run = false;
51+
bool offsets_aligned = false;
52+
bool valid = false;
53+
};
54+
55+
// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
56+
3557
// primary template
3658
template <class T, cuda::thread_scope Scope>
3759
class roaring_bitmap_impl {
@@ -48,22 +70,33 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
4870
static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value
4971

5072
public:
73+
using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
5174
static constexpr auto thread_scope = Scope;
5275

53-
__host__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
54-
cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
55-
cuda_thread_scope<Scope> /* scope */)
56-
: data_{compressed_bitmap_d}
76+
__host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap,
77+
metadata_type metadata,
78+
cuda_thread_scope<Scope> /* scope */)
5779
{
58-
bool success = this->read_header(compressed_bitmap_h);
59-
CUCO_EXPECTS(success, "Failed to read compressed bitmap");
80+
NV_IF_TARGET(
81+
NV_IS_HOST,
82+
CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");) // TODO device error handling
83+
84+
if (metadata.valid) {
85+
data_ = cuda::std::span<cuda::std::byte const>{bitmap, metadata.size_bytes};
86+
size_ = metadata.num_keys;
87+
num_containers_ = metadata.num_containers;
88+
run_container_bitmap_ =
89+
reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap);
90+
key_cards_ = reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
91+
offsets_ = reinterpret_cast<cuda::std::byte const*>(bitmap + metadata.container_offsets);
92+
offsets_aligned_ = metadata.offsets_aligned;
93+
has_run_ = metadata.has_run;
94+
}
6095
}
6196

62-
__device__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap,
63-
cuda_thread_scope<Scope> /* scope */)
64-
: data_{compressed_bitmap}
97+
__device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, cuda_thread_scope<Scope> scope)
98+
: roaring_bitmap_impl(bitmap, read_metadata(bitmap), scope)
6599
{
66-
this->read_header(compressed_bitmap); // TODO error handling
67100
}
68101

69102
template <class InputIt, class OutputIt>
@@ -135,19 +168,102 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
135168
return data_;
136169
}
137170

171+
__host__ __device__ static metadata_type const read_metadata(
172+
cuda::std::byte const* bitmap) noexcept
173+
{
174+
cuda::std::byte const* buf = bitmap;
175+
metadata_type metadata;
176+
177+
cuda::std::uint32_t cookie;
178+
cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
179+
buf += sizeof(cuda::std::uint32_t);
180+
if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
181+
metadata.valid = false;
182+
return metadata;
183+
}
184+
185+
if ((cookie & 0xFFFF) == serial_cookie)
186+
metadata.num_containers = (cookie >> 16) + 1;
187+
else {
188+
cuda::std::memcpy(&metadata.num_containers, buf, sizeof(cuda::std::uint32_t));
189+
buf += sizeof(cuda::std::uint32_t);
190+
}
191+
if (metadata.num_containers < 0) {
192+
metadata.valid = false;
193+
return metadata;
194+
}
195+
if (metadata.num_containers > (1 << 16)) {
196+
metadata.valid = false;
197+
return metadata;
198+
}
199+
200+
metadata.has_run = (cookie & 0xFFFF) == serial_cookie;
201+
if (metadata.has_run) {
202+
metadata.valid = false;
203+
return metadata; // TODO run container bitmap is not supported yet
204+
cuda::std::size_t s = (metadata.num_containers + 7) / 8;
205+
metadata.run_container_bitmap = cuda::std::distance(bitmap, buf);
206+
buf += s;
207+
}
208+
209+
metadata.key_cards = cuda::std::distance(bitmap, buf);
210+
buf += metadata.num_containers * 2 * sizeof(cuda::std::uint16_t);
211+
212+
if ((!metadata.has_run) || (metadata.num_containers >= no_offset_threshold)) {
213+
metadata.container_offsets = cuda::std::distance(bitmap, buf);
214+
metadata.offsets_aligned =
215+
(reinterpret_cast<cuda::std::uintptr_t>(bitmap + metadata.container_offsets) %
216+
sizeof(cuda::std::uint32_t)) == 0;
217+
buf += metadata.num_containers * 4;
218+
}
219+
220+
metadata.num_keys = 0;
221+
cuda::std::uint16_t const* key_cards =
222+
reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
223+
cuda::std::uint32_t card = 0;
224+
for (cuda::std::int32_t i = 0; i < metadata.num_containers; i++) {
225+
// cuda::std::uint16_t key = key_cards[i * 2];
226+
card = key_cards[i * 2 + 1] + 1;
227+
metadata.num_keys += card;
228+
}
229+
230+
// find end of roaring bitmap
231+
cuda::std::byte const* end = bitmap + container_offset(bitmap + metadata.container_offsets,
232+
metadata.offsets_aligned,
233+
metadata.num_containers - 1);
234+
if (is_run_container(
235+
reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap),
236+
metadata.has_run,
237+
metadata.num_containers - 1)) {
238+
// TODO implement
239+
} else {
240+
if (card <= 4096) { // TODO check if this is correct
241+
end += card * sizeof(cuda::std::uint16_t);
242+
} else {
243+
end += 8192; // fixed size bitset container
244+
}
245+
}
246+
247+
metadata.size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, end));
248+
metadata.valid = true;
249+
return metadata;
250+
}
251+
138252
private:
139-
__device__ bool is_run_container(cuda::std::int32_t i) const
253+
__host__ __device__ static bool is_run_container(cuda::std::uint8_t const* run_container_bitmap,
254+
bool has_run,
255+
cuda::std::int32_t i)
140256
{
141-
if (not has_run_) return false;
142-
return run_container_bitmap_[i / 8] & (1 << (i % 8));
257+
if (not has_run) return false;
258+
return run_container_bitmap[i / 8] & (1 << (i % 8));
143259
}
144260

145261
__device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
146262
{
147-
cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1;
148-
cuda::std::uint16_t const* container =
149-
reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(index));
150-
if (this->is_run_container(index)) {
263+
cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1;
264+
cuda::std::uint16_t const* container = reinterpret_cast<cuda::std::uint16_t const*>(
265+
data_.data() + container_offset(offsets_, offsets_aligned_, index));
266+
if (is_run_container(run_container_bitmap_, has_run_, index)) {
151267
return this->contains_run_container(container, lower, card);
152268
} else {
153269
if (card <= 4096) { // TODO check if this is correct
@@ -202,116 +318,26 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
202318
return false;
203319
}
204320

205-
__device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const
321+
__host__ __device__ static cuda::std::uint32_t container_offset(cuda::std::byte const* offsets,
322+
bool offsets_aligned,
323+
cuda::std::int32_t i)
206324
{
207-
cuda::std::uint32_t offset;
208-
if (offsets_aligned_) {
325+
cuda::std::uint32_t offset = 0;
326+
if (offsets_aligned) {
209327
offset =
210-
*reinterpret_cast<cuda::std::uint32_t const*>(offsets_ + i * sizeof(cuda::std::uint32_t));
328+
*reinterpret_cast<cuda::std::uint32_t const*>(offsets + i * sizeof(cuda::std::uint32_t));
211329
} else {
212330
cuda::std::memcpy(
213-
&offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
331+
&offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
214332
}
215333
return offset;
216334
}
217335

218-
__host__ __device__ bool read_header(cuda::std::span<cuda::std::byte const> compressed_bitmap)
219-
{
220-
cuda::std::size_t length = compressed_bitmap.size();
221-
cuda::std::byte const* buf = compressed_bitmap.data();
222-
[[maybe_unused]] cuda::std::size_t readbytes = 0;
223-
224-
// cookie and num_containers
225-
if (length < 4) {
226-
// printf("length is less than 4\n");
227-
return false;
228-
}
229-
230-
cuda::std::uint32_t cookie;
231-
cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
232-
readbytes += sizeof(cuda::std::uint32_t);
233-
buf += sizeof(cuda::std::uint32_t);
234-
if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
235-
// printf("cookie is not serial cookie or serial cookie no runcontainer\n");
236-
return false;
237-
}
238-
239-
if ((cookie & 0xFFFF) == serial_cookie)
240-
num_containers_ = (cookie >> 16) + 1;
241-
else {
242-
readbytes += sizeof(cuda::std::uint32_t);
243-
if (readbytes > length) {
244-
// printf("readbytes is greater than length\n");
245-
return false;
246-
}
247-
cuda::std::memcpy(&num_containers_, buf, sizeof(cuda::std::uint32_t));
248-
buf += sizeof(cuda::std::uint32_t);
249-
}
250-
if (num_containers_ < 0) {
251-
// printf("num_containers_ is less than 0\n");
252-
return false;
253-
}
254-
if (num_containers_ > (1 << 16)) {
255-
// printf("num_containers_ is greater than 65536\n");
256-
return false;
257-
}
258-
// printf("num_containers_: %d\n", num_containers_);
259-
260-
has_run_ = (cookie & 0xFFFF) == serial_cookie;
261-
if (has_run_) {
262-
cuda::std::size_t s = (num_containers_ + 7) / 8;
263-
readbytes += s;
264-
if (readbytes > length) {
265-
// printf("readbytes is greater than length\n");
266-
return false;
267-
}
268-
run_container_bitmap_ = reinterpret_cast<cuda::std::uint8_t const*>(buf);
269-
buf += s;
270-
}
271-
// printf("has_run: %d\n", has_run_);
272-
273-
key_cards_ = reinterpret_cast<cuda::std::uint16_t const*>(buf);
274-
readbytes += num_containers_ * 2 * sizeof(cuda::std::uint16_t);
275-
if (readbytes > length) {
276-
// printf("readbytes is greater than length\n");
277-
return false;
278-
}
279-
buf += num_containers_ * 2 * sizeof(cuda::std::uint16_t);
280-
281-
if ((!has_run_) || (num_containers_ >= no_offset_threshold)) {
282-
readbytes += num_containers_ * 4;
283-
if (readbytes > length) {
284-
// printf("readbytes is greater than length\n");
285-
return false;
286-
}
287-
offsets_ = buf;
288-
offsets_aligned_ =
289-
(reinterpret_cast<cuda::std::uintptr_t>(offsets_) % sizeof(cuda::std::uint32_t)) == 0;
290-
buf += num_containers_ * 4;
291-
}
292-
293-
readbytes += num_containers_ * 4;
294-
if (readbytes > length) {
295-
// printf("readbytes is greater than length\n");
296-
return false;
297-
}
298-
299-
size_ = 0;
300-
for (cuda::std::int32_t i = 0; i < num_containers_; i++) {
301-
// cuda::std::uint16_t key = key_cards_[i * 2];
302-
cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1;
303-
size_ += card;
304-
// printf("key: %d, card: %d\n", key, card);
305-
}
306-
307-
return true;
308-
}
309-
310336
cuda::std::span<cuda::std::byte const> data_;
311337
cuda::std::size_t size_;
312338
cuda::std::int32_t num_containers_;
313339
cuda::std::uint8_t const* run_container_bitmap_;
314-
cuda::std::uint16_t const* key_cards_;
340+
cuda::std::uint16_t const* key_cards_; // TODO uint8?
315341
cuda::std::byte const* offsets_;
316342
bool offsets_aligned_;
317343
bool has_run_;

0 commit comments

Comments
 (0)