 #include <thrust/fill.h>
 #include <thrust/transform.h>

+#include <nv/target>
+
 namespace cuco::detail {

+template <class T>
+struct roaring_bitmap_metadata {
+  static_assert(cuco::dependent_false<T>, "T must be either uint32_t or uint64_t");
+};
+
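+// Header fields parsed from a serialized 32-bit roaring bitmap. The run_container_bitmap,
+// key_cards, and container_offsets members are byte offsets relative to the start of the
+// serialized buffer.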
+template <>
+struct roaring_bitmap_metadata<cuda::std::uint32_t> {
+  cuda::std::size_t size_bytes           = 0;
+  cuda::std::size_t num_keys             = 0;
+  cuda::std::size_t run_container_bitmap = 0;
+  cuda::std::size_t key_cards            = 0;
+  cuda::std::size_t container_offsets    = 0;
+  cuda::std::int32_t num_containers      = 0;
+  bool has_run         = false;
+  bool offsets_aligned = false;
+  bool valid           = false;
+};
+
+// TODO implement roaring_bitmap_metadata<cuda::std::uint64_t>
+
 // primary template
 template <class T, cuda::thread_scope Scope>
 class roaring_bitmap_impl {
@@ -48,22 +70,33 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
   static constexpr cuda::std::uint32_t binary_search_threshold = 8;  // TODO determine optimal value

  public:
+  using metadata_type = roaring_bitmap_metadata<cuda::std::uint32_t>;
   static constexpr auto thread_scope = Scope;

-  __host__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap_h,
-                               cuda::std::span<cuda::std::byte const> compressed_bitmap_d,
-                               cuda_thread_scope<Scope> /* scope */)
-    : data_{compressed_bitmap_d}
+  __host__ __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap,
+                                          metadata_type metadata,
+                                          cuda_thread_scope<Scope> /* scope */)
   {
-    bool success = this->read_header(compressed_bitmap_h);
-    CUCO_EXPECTS(success, "Failed to read compressed bitmap");
+    NV_IF_TARGET(
+      NV_IS_HOST,
+      CUCO_EXPECTS(metadata.valid, "Invalid bitmap format");)  // TODO device error handling
+
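+    // Resolve the byte offsets recorded in the metadata into raw pointers into the buffer.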
+    if (metadata.valid) {
+      data_           = cuda::std::span<cuda::std::byte const>{bitmap, metadata.size_bytes};
+      size_           = metadata.num_keys;
+      num_containers_ = metadata.num_containers;
+      run_container_bitmap_ =
+        reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap);
+      key_cards_       = reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
+      offsets_         = reinterpret_cast<cuda::std::byte const*>(bitmap + metadata.container_offsets);
+      offsets_aligned_ = metadata.offsets_aligned;
+      has_run_         = metadata.has_run;
+    }
   }

-  __device__ roaring_bitmap_impl(cuda::std::span<cuda::std::byte const> compressed_bitmap,
-                                 cuda_thread_scope<Scope> /* scope */)
-    : data_{compressed_bitmap}
+  __device__ roaring_bitmap_impl(cuda::std::byte const* bitmap, cuda_thread_scope<Scope> scope)
+    : roaring_bitmap_impl(bitmap, read_metadata(bitmap), scope)
   {
-    this->read_header(compressed_bitmap);  // TODO error handling
   }

   template <class InputIt, class OutputIt>
@@ -135,19 +168,102 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     return data_;
   }

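+  // Parses the roaring bitmap serialization header: the cookie, the number of containers, the
+  // optional run-container bitmap, the (key, cardinality) pairs, and the container offsets.
+  // Also derives the total number of keys and the size in bytes of the serialized bitmap.
+  // Returns metadata with valid == false if the format is not supported.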
+  __host__ __device__ static metadata_type const read_metadata(
+    cuda::std::byte const* bitmap) noexcept
+  {
+    cuda::std::byte const* buf = bitmap;
+    metadata_type metadata;
+
+    cuda::std::uint32_t cookie;
+    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
+    buf += sizeof(cuda::std::uint32_t);
+    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
+      metadata.valid = false;
+      return metadata;
+    }
+
+    if ((cookie & 0xFFFF) == serial_cookie)
+      metadata.num_containers = (cookie >> 16) + 1;
+    else {
+      cuda::std::memcpy(&metadata.num_containers, buf, sizeof(cuda::std::uint32_t));
+      buf += sizeof(cuda::std::uint32_t);
+    }
+    if (metadata.num_containers < 0) {
+      metadata.valid = false;
+      return metadata;
+    }
+    if (metadata.num_containers > (1 << 16)) {
+      metadata.valid = false;
+      return metadata;
+    }
+
+    metadata.has_run = (cookie & 0xFFFF) == serial_cookie;
+    if (metadata.has_run) {
+      metadata.valid = false;
+      return metadata;  // TODO run container bitmap is not supported yet
+      cuda::std::size_t s = (metadata.num_containers + 7) / 8;
+      metadata.run_container_bitmap = cuda::std::distance(bitmap, buf);
+      buf += s;
+    }
+
+    metadata.key_cards = cuda::std::distance(bitmap, buf);
+    buf += metadata.num_containers * 2 * sizeof(cuda::std::uint16_t);
+
+    if ((!metadata.has_run) || (metadata.num_containers >= no_offset_threshold)) {
+      metadata.container_offsets = cuda::std::distance(bitmap, buf);
+      metadata.offsets_aligned =
+        (reinterpret_cast<cuda::std::uintptr_t>(bitmap + metadata.container_offsets) %
+         sizeof(cuda::std::uint32_t)) == 0;
+      buf += metadata.num_containers * 4;
+    }
+
+    metadata.num_keys = 0;
+    cuda::std::uint16_t const* key_cards =
+      reinterpret_cast<cuda::std::uint16_t const*>(bitmap + metadata.key_cards);
+    cuda::std::uint32_t card = 0;
+    for (cuda::std::int32_t i = 0; i < metadata.num_containers; i++) {
+      // cuda::std::uint16_t key = key_cards[i * 2];
+      card = key_cards[i * 2 + 1] + 1;
+      metadata.num_keys += card;
+    }
+
+    // find end of roaring bitmap
+    cuda::std::byte const* end = bitmap + container_offset(bitmap + metadata.container_offsets,
+                                                           metadata.offsets_aligned,
+                                                           metadata.num_containers - 1);
+    if (is_run_container(
+          reinterpret_cast<cuda::std::uint8_t const*>(bitmap + metadata.run_container_bitmap),
+          metadata.has_run,
+          metadata.num_containers - 1)) {
+      // TODO implement
+    } else {
+      if (card <= 4096) {  // TODO check if this is correct
+        end += card * sizeof(cuda::std::uint16_t);
+      } else {
+        end += 8192;  // fixed size bitset container
+      }
+    }
+
+    metadata.size_bytes = static_cast<cuda::std::size_t>(cuda::std::distance(bitmap, end));
+    metadata.valid      = true;
+    return metadata;
+  }
+
  private:
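+  // Tests bit i of the run-container bitmap; static so that read_metadata can reuse it.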
-  __device__ bool is_run_container(cuda::std::int32_t i) const
+  __host__ __device__ static bool is_run_container(cuda::std::uint8_t const* run_container_bitmap,
+                                                   bool has_run,
+                                                   cuda::std::int32_t i)
   {
-    if (not has_run_) return false;
-    return run_container_bitmap_[i / 8] & (1 << (i % 8));
+    if (not has_run) return false;
+    return run_container_bitmap[i / 8] & (1 << (i % 8));
   }

   __device__ bool contains_container(cuda::std::uint16_t lower, cuda::std::uint32_t index) const
   {
-    cuda::std::uint32_t card = key_cards_[index * 2 + 1] + 1;
-    cuda::std::uint16_t const* container =
-      reinterpret_cast<cuda::std::uint16_t const*>(data_.data() + this->container_offset(index));
-    if (this->is_run_container(index)) {
+    cuda::std::uint32_t card              = key_cards_[index * 2 + 1] + 1;
+    cuda::std::uint16_t const* container  = reinterpret_cast<cuda::std::uint16_t const*>(
+      data_.data() + container_offset(offsets_, offsets_aligned_, index));
+    if (is_run_container(run_container_bitmap_, has_run_, index)) {
       return this->contains_run_container(container, lower, card);
     } else {
       if (card <= 4096) {  // TODO check if this is correct
@@ -202,116 +318,26 @@ class roaring_bitmap_impl<cuda::std::uint32_t, Scope> {
     return false;
   }

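+  // Reads the i-th 32-bit container offset; the serialized offset array may not be 4-byte
+  // aligned, in which case the value is copied out with memcpy instead of being dereferenced.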
-  __device__ cuda::std::uint32_t container_offset(cuda::std::int32_t i) const
+  __host__ __device__ static cuda::std::uint32_t container_offset(cuda::std::byte const* offsets,
+                                                                  bool offsets_aligned,
+                                                                  cuda::std::int32_t i)
   {
-    cuda::std::uint32_t offset;
-    if (offsets_aligned_) {
+    cuda::std::uint32_t offset = 0;
+    if (offsets_aligned) {
       offset =
-        *reinterpret_cast<cuda::std::uint32_t const*>(offsets_ + i * sizeof(cuda::std::uint32_t));
+        *reinterpret_cast<cuda::std::uint32_t const*>(offsets + i * sizeof(cuda::std::uint32_t));
     } else {
       cuda::std::memcpy(
-        &offset, offsets_ + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
+        &offset, offsets + i * sizeof(cuda::std::uint32_t), sizeof(cuda::std::uint32_t));
     }
     return offset;
   }

-  __host__ __device__ bool read_header(cuda::std::span<cuda::std::byte const> compressed_bitmap)
-  {
-    cuda::std::size_t length   = compressed_bitmap.size();
-    cuda::std::byte const* buf = compressed_bitmap.data();
-    [[maybe_unused]] cuda::std::size_t readbytes = 0;
-
-    // cookie and num_containers
-    if (length < 4) {
-      // printf("length is less than 4\n");
-      return false;
-    }
-
-    cuda::std::uint32_t cookie;
-    cuda::std::memcpy(&cookie, buf, sizeof(cuda::std::uint32_t));
-    readbytes += sizeof(cuda::std::uint32_t);
-    buf += sizeof(cuda::std::uint32_t);
-    if ((cookie & 0xFFFF) != serial_cookie && cookie != serial_cookie_no_runcontainer) {
-      // printf("cookie is not serial cookie or serial cookie no runcontainer\n");
-      return false;
-    }
-
-    if ((cookie & 0xFFFF) == serial_cookie)
-      num_containers_ = (cookie >> 16) + 1;
-    else {
-      readbytes += sizeof(cuda::std::uint32_t);
-      if (readbytes > length) {
-        // printf("readbytes is greater than length\n");
-        return false;
-      }
-      cuda::std::memcpy(&num_containers_, buf, sizeof(cuda::std::uint32_t));
-      buf += sizeof(cuda::std::uint32_t);
-    }
-    if (num_containers_ < 0) {
-      // printf("num_containers_ is less than 0\n");
-      return false;
-    }
-    if (num_containers_ > (1 << 16)) {
-      // printf("num_containers_ is greater than 65536\n");
-      return false;
-    }
-    // printf("num_containers_: %d\n", num_containers_);
-
-    has_run_ = (cookie & 0xFFFF) == serial_cookie;
-    if (has_run_) {
-      cuda::std::size_t s = (num_containers_ + 7) / 8;
-      readbytes += s;
-      if (readbytes > length) {
-        // printf("readbytes is greater than length\n");
-        return false;
-      }
-      run_container_bitmap_ = reinterpret_cast<cuda::std::uint8_t const*>(buf);
-      buf += s;
-    }
-    // printf("has_run: %d\n", has_run_);
-
-    key_cards_ = reinterpret_cast<cuda::std::uint16_t const*>(buf);
-    readbytes += num_containers_ * 2 * sizeof(cuda::std::uint16_t);
-    if (readbytes > length) {
-      // printf("readbytes is greater than length\n");
-      return false;
-    }
-    buf += num_containers_ * 2 * sizeof(cuda::std::uint16_t);
-
-    if ((!has_run_) || (num_containers_ >= no_offset_threshold)) {
-      readbytes += num_containers_ * 4;
-      if (readbytes > length) {
-        // printf("readbytes is greater than length\n");
-        return false;
-      }
-      offsets_ = buf;
-      offsets_aligned_ =
-        (reinterpret_cast<cuda::std::uintptr_t>(offsets_) % sizeof(cuda::std::uint32_t)) == 0;
-      buf += num_containers_ * 4;
-    }
-
-    readbytes += num_containers_ * 4;
-    if (readbytes > length) {
-      // printf("readbytes is greater than length\n");
-      return false;
-    }
-
-    size_ = 0;
-    for (cuda::std::int32_t i = 0; i < num_containers_; i++) {
-      // cuda::std::uint16_t key = key_cards_[i * 2];
-      cuda::std::uint32_t card = key_cards_[i * 2 + 1] + 1;
-      size_ += card;
-      // printf("key: %d, card: %d\n", key, card);
-    }
-
-    return true;
-  }
-
   cuda::std::span<cuda::std::byte const> data_;
   cuda::std::size_t size_;
   cuda::std::int32_t num_containers_;
   cuda::std::uint8_t const* run_container_bitmap_;
-  cuda::std::uint16_t const* key_cards_;
+  cuda::std::uint16_t const* key_cards_;  // TODO uint8?
   cuda::std::byte const* offsets_;
   bool offsets_aligned_;
   bool has_run_;