1313 * See the License for the specific language governing permissions and
1414 * limitations under the License.
1515 */
16+
1617#pragma once
1718
1819#include < cuco/detail/error.hpp>
@@ -44,6 +45,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
4445 static constexpr cuda::std::uint32_t serial_cookie = 12347 ;
4546 static constexpr cuda::std::uint32_t frozen_cookie = 13766 ;
4647 static constexpr cuda::std::int32_t no_offset_threshold = 4 ;
48+ static constexpr cuda::std::uint32_t binary_search_threshold = 8 ; // TODO determine optimal value
4749
4850 public:
4951 static constexpr auto thread_scope = Scope;
@@ -99,20 +101,25 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
99101 cuda::std::uint16_t upper = value >> 16 ;
100102 cuda::std::uint16_t lower = value & 0xFFFF ;
101103
102- // TODO binary search on key_cards_
103- for (cuda::std::int32_t i = 0 ; i < num_containers_; i++) {
104- if (key_cards_[i * 2 ] == upper) {
105- cuda::std::uint32_t card = key_cards_[i * 2 + 1 ] + 1 ;
106- cuda::std::uint16_t const * container =
107- reinterpret_cast <cuda::std::uint16_t const *>(data_.data () + this ->container_offset (i));
108- if (this ->is_run_container (i)) {
109- return this ->contains_run_container (container, lower, card);
104+ // Binary search on key_cards_ to find container with matching upper key
105+ cuda::std::uint32_t left = 0 ;
106+ cuda::std::uint32_t right = num_containers_;
107+
108+ if (num_containers_ < binary_search_threshold) {
109+ for (cuda::std::uint32_t i = 0 ; i < num_containers_; i++) {
110+ if (key_cards_[i * 2 ] == upper) { return this ->contains_container (lower, i); }
111+ }
112+ } else {
113+ while (left < right) {
114+ cuda::std::uint32_t mid = left + (right - left) / 2 ;
115+ cuda::std::uint16_t mid_key = key_cards_[mid * 2 ];
116+
117+ if (mid_key == upper) {
118+ return this ->contains_container (lower, mid);
119+ } else if (mid_key < upper) {
120+ left = mid + 1 ;
110121 } else {
111- if (card <= 4096 ) { // TODO check if this is correct
112- return this ->contains_array_container (container, lower, card);
113- } else {
114- return this ->contains_bitset_container (container, lower, card);
115- }
122+ right = mid;
116123 }
117124 }
118125 }
@@ -135,16 +142,48 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
135142 return run_container_bitmap_[i / 8 ] & (1 << (i % 8 ));
136143 }
137144
145+ __device__ bool contains_container (cuda::std::uint16_t lower, cuda::std::uint32_t index) const
146+ {
147+ cuda::std::uint32_t card = key_cards_[index * 2 + 1 ] + 1 ;
148+ cuda::std::uint16_t const * container =
149+ reinterpret_cast <cuda::std::uint16_t const *>(data_.data () + this ->container_offset (index));
150+ if (this ->is_run_container (index)) {
151+ return this ->contains_run_container (container, lower, card);
152+ } else {
153+ if (card <= 4096 ) { // TODO check if this is correct
154+ return this ->contains_array_container (container, lower, card);
155+ } else {
156+ return this ->contains_bitset_container (container, lower, card);
157+ }
158+ }
159+ }
160+
138161 __device__ bool contains_array_container (cuda::std::uint16_t const * container,
139162 cuda::std::uint16_t lower,
140163 cuda::std::uint32_t card) const
141164 {
142- // TODO binary search on container
143- // if (card < 256) -> linear search
144- for (cuda::std::uint32_t i = 0 ; i < card; i++) {
145- if (container[i] == lower) { return true ; }
165+ // Use linear search for small arrays, binary search for larger ones
166+ if (card < binary_search_threshold) {
167+ for (cuda::std::uint32_t i = 0 ; i < card; i++) {
168+ if (container[i] == lower) { return true ; }
169+ }
170+ return false ;
171+ } else {
172+ cuda::std::uint32_t left = 0 ;
173+ cuda::std::uint32_t right = card;
174+
175+ while (left < right) {
176+ cuda::std::uint32_t mid = left + (right - left) / 2 ;
177+ if (container[mid] == lower) {
178+ return true ;
179+ } else if (container[mid] < lower) {
180+ left = mid + 1 ;
181+ } else {
182+ right = mid;
183+ }
184+ }
185+ return false ;
146186 }
147- return false ;
148187 }
149188
150189 __device__ bool contains_bitset_container (cuda::std::uint16_t const * container,
@@ -166,8 +205,13 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
166205 __device__ cuda::std::uint32_t container_offset (cuda::std::int32_t i) const
167206 {
168207 cuda::std::uint32_t offset;
169- cuda::std::memcpy (
170- &offset, offsets_ + i * sizeof (cuda::std::uint32_t ), sizeof (cuda::std::uint32_t ));
208+ if (offsets_aligned_) {
209+ offset =
210+ *reinterpret_cast <cuda::std::uint32_t const *>(offsets_ + i * sizeof (cuda::std::uint32_t ));
211+ } else {
212+ cuda::std::memcpy (
213+ &offset, offsets_ + i * sizeof (cuda::std::uint32_t ), sizeof (cuda::std::uint32_t ));
214+ }
171215 return offset;
172216 }
173217
@@ -241,6 +285,8 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
241285 return false ;
242286 }
243287 offsets_ = buf;
288+ offsets_aligned_ =
289+ (reinterpret_cast <cuda::std::uintptr_t >(offsets_) % sizeof (cuda::std::uint32_t )) == 0 ;
244290 buf += num_containers_ * 4 ;
245291 }
246292
@@ -267,6 +313,7 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
267313 cuda::std::uint8_t const * run_container_bitmap_;
268314 cuda::std::uint16_t const * key_cards_;
269315 cuda::std::byte const * offsets_;
316+ bool offsets_aligned_;
270317 bool has_run_;
271318};
272319
0 commit comments