Skip to content

Commit 77a4c1d

Browse files
committed
Optimizations
1 parent 7ff8399 commit 77a4c1d

File tree

2 files changed

+69
-24
lines changed

2 files changed

+69
-24
lines changed

benchmarks/roaring_bitmap/contains_bench.cu

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16+
1617
#include <benchmark_defaults.hpp>
1718
#include <benchmark_utils.hpp>
1819

@@ -38,7 +39,6 @@ void roaring_bitmap_contains(nvbench::state& state)
3839
fs::path path = source_dir / "../../examples/roaring_bitmap/bitmapwithoutruns.bin";
3940
fs::path full_path = path.lexically_normal();
4041

41-
// Open file
4242
std::ifstream file(full_path, std::ios::binary);
4343
if (!file.is_open()) { state.skip("Failed to open bitmap file"); }
4444

@@ -47,11 +47,9 @@ void roaring_bitmap_contains(nvbench::state& state)
4747
std::streamsize file_size = file.tellg();
4848
file.seekg(0, std::ios::beg);
4949

50-
// Allocate pinned host memory using cudaMallocHost
5150
char* buffer;
5251
CUCO_CUDA_TRY(cudaMallocHost(&buffer, file_size));
5352

54-
// Read file into memory
5553
file.read(buffer, file_size);
5654
file.close();
5755

@@ -70,7 +68,7 @@ void roaring_bitmap_contains(nvbench::state& state)
7068
keys.push_back(k);
7169
}
7270

73-
// multiply the keys for the benchmark
71+
// multiply the keys for more accurate benchmark numbers
7472
for (int i = 0; i < 13; i++) {
7573
keys.insert(keys.end(), keys.begin(), keys.end());
7674
}

include/cuco/detail/roaring_bitmap/roaring_bitmap_impl.cuh

Lines changed: 67 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16+
1617
#pragma once
1718

1819
#include <cuco/detail/error.hpp>
@@ -44,6 +45,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
4445
static constexpr cuda::std::uint32_t serial_cookie = 12347;
4546
static constexpr cuda::std::uint32_t frozen_cookie = 13766;
4647
static constexpr cuda::std::int32_t no_offset_threshold = 4;
48+
static constexpr cuda::std::uint32_t binary_search_threshold = 8; // TODO determine optimal value
4749

4850
public:
4951
static constexpr auto thread_scope = Scope;
@@ -99,20 +101,25 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
99101
cuda::std::uint16_t upper = value >> 16;
100102
cuda::std::uint16_t lower = value & 0xFFFF;
101103

102-
// TODO binary search on key_cards_
103-
for (cuda::std::int32_t i = 0; i < num_containers_; i++) {
104-
if (key_cards_[i * 2] == upper) {
105-
cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1;
106-
cuda::std::uint16_t const* container =
107-
reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(i));
108-
if (this->is_run_container(i)) {
109-
return this->contains_run_container(container, lower, card);
104+
// Binary search on key_cards_ to find container with matching upper key
105+
cuda::std::uint32_t left = 0;
106+
cuda::std::uint32_t right = num_containers_;
107+
108+
if (num_containers_ < binary_search_threshold) {
109+
for (cuda::std::uint32_t i = 0; i < num_containers_; i++) {
110+
if (key_cards_[i * 2] == upper) { return this->contains_container(lower, i); }
111+
}
112+
} else {
113+
while (left < right) {
114+
cuda::std::uint32_t mid = left + (right - left) / 2;
115+
cuda::std::uint16_t mid_key = key_cards_[mid * 2];
116+
117+
if (mid_key == upper) {
118+
return this->contains_container(lower, mid);
119+
} else if (mid_key < upper) {
120+
left = mid + 1;
110121
} else {
111-
if (card <= 4096) { // TODO check if this is correct
112-
return this->contains_array_container(container, lower, card);
113-
} else {
114-
return this->contains_bitset_container(container, lower, card);
115-
}
122+
right = mid;
116123
}
117124
}
118125
}
@@ -135,16 +142,48 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
135142
return run_container_bitmap_[i / 8] & (1 << (i % 8));
136143
}
137144

145+
__device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
146+
{
147+
cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1;
148+
cuda::std::uint16_t const* container =
149+
reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(index));
150+
if (this->is_run_container(index)) {
151+
return this->contains_run_container(container, lower, card);
152+
} else {
153+
if (card <= 4096) { // TODO check if this is correct
154+
return this->contains_array_container(container, lower, card);
155+
} else {
156+
return this->contains_bitset_container(container, lower, card);
157+
}
158+
}
159+
}
160+
138161
__device__ bool contains_array_container(cuda::std::uint16_t const* container,
139162
cuda::std::uint16_t lower,
140163
cuda::std::uint32_t card) const
141164
{
142-
// TODO binary search on container
143-
// if (card < 256) -> linear search
144-
for (cuda::std::uint32_t i = 0; i < card; i++) {
145-
if (container[i] == lower) { return true; }
165+
// Use linear search for small arrays, binary search for larger ones
166+
if (card < binary_search_threshold) {
167+
for (cuda::std::uint32_t i = 0; i < card; i++) {
168+
if (container[i] == lower) { return true; }
169+
}
170+
return false;
171+
} else {
172+
cuda::std::uint32_t left = 0;
173+
cuda::std::uint32_t right = card;
174+
175+
while (left < right) {
176+
cuda::std::uint32_t mid = left + (right - left) / 2;
177+
if (container[mid] == lower) {
178+
return true;
179+
} else if (container[mid] < lower) {
180+
left = mid + 1;
181+
} else {
182+
right = mid;
183+
}
184+
}
185+
return false;
146186
}
147-
return false;
148187
}
149188

150189
__device__ bool contains_bitset_container(cuda::std::uint16_t const* container,
@@ -166,8 +205,13 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
166205
__device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const
167206
{
168207
cuda::std::uint32_t offset;
169-
cuda::std::memcpy(
170-
&offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
208+
if (offsets_aligned_) {
209+
offset =
210+
*reinterpret_cast<cuda::std::uint32_t const*>(offsets_ + i * sizeof(cuda::std::uint32_t));
211+
} else {
212+
cuda::std::memcpy(
213+
&offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
214+
}
171215
return offset;
172216
}
173217

@@ -241,6 +285,8 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
241285
return false;
242286
}
243287
offsets_ = buf;
288+
offsets_aligned_ =
289+
(reinterpret_cast<cuda::std::uintptr_t>(offsets_) % sizeof(cuda::std::uint32_t)) == 0;
244290
buf += num_containers_ * 4;
245291
}
246292

@@ -267,6 +313,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
267313
cuda::std::uint8_t const* run_container_bitmap_;
268314
cuda::std::uint16_t const* key_cards_;
269315
cuda::std::byte const* offsets_;
316+
bool offsets_aligned_;
270317
bool has_run_;
271318
};
272319

0 commit comments

Comments
 (0)