|
| 1 | +// Licensed to the Apache Software Foundation (ASF) under one |
| 2 | +// or more contributor license agreements. See the NOTICE file |
| 3 | +// distributed with this work for additional information |
| 4 | +// regarding copyright ownership. The ASF licenses this file |
| 5 | +// to you under the Apache License, Version 2.0 (the |
| 6 | +// "License"); you may not use this file except in compliance |
| 7 | +// with the License. You may obtain a copy of the License at |
| 8 | +// |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// |
| 11 | +// Unless required by applicable law or agreed to in writing, |
| 12 | +// software distributed under the License is distributed on an |
| 13 | +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| 14 | +// KIND, either express or implied. See the License for the |
| 15 | +// specific language governing permissions and limitations |
| 16 | +// under the License. |
| 17 | + |
| 18 | +#pragma once |
| 19 | + |
| 20 | +#include <atomic> |
| 21 | +#include <cstdint> |
| 22 | +#include <vector> |
| 23 | + |
| 24 | +#include "arrow/type_fwd.h" |
| 25 | +#include "arrow/util/macros.h" |
| 26 | + |
| 27 | +namespace arrow { |
| 28 | +namespace internal { |
| 29 | + |
/// \brief Result of resolving a logical index: which chunk it falls in and
/// where inside that chunk.
///
/// Kept as a plain aggregate so it can be brace-initialized as
/// `{chunk_index, index_in_chunk}`.
struct ChunkLocation {
  /// Index of the chunk.
  int64_t chunk_index;
  /// Index of the value relative to the start of that chunk.
  int64_t index_in_chunk;
};
| 33 | + |
| 34 | +// An object that resolves an array chunk depending on a logical index |
| 35 | +struct ChunkResolver { |
| 36 | + explicit ChunkResolver(const ArrayVector& chunks); |
| 37 | + |
| 38 | + explicit ChunkResolver(const std::vector<const Array*>& chunks); |
| 39 | + |
| 40 | + explicit ChunkResolver(const RecordBatchVector& batches); |
| 41 | + |
| 42 | + ChunkResolver(ChunkResolver&& other) |
| 43 | + : offsets_(std::move(other.offsets_)), cached_chunk_(other.cached_chunk_.load()) {} |
| 44 | + |
| 45 | + ChunkResolver& operator=(ChunkResolver&& other) { |
| 46 | + offsets_ = std::move(other.offsets_); |
| 47 | + cached_chunk_.store(other.cached_chunk_.load()); |
| 48 | + return *this; |
| 49 | + } |
| 50 | + |
| 51 | + /// \brief Return a ChunkLocation containing the chunk index and in-chunk value index of |
| 52 | + /// the chunked array at logical index |
| 53 | + inline ChunkLocation Resolve(const int64_t index) const { |
| 54 | + // It is common for the algorithms below to make consecutive accesses at |
| 55 | + // a relatively small distance from each other, hence often falling in |
| 56 | + // the same chunk. |
| 57 | + // This is trivial when merging (assuming each side of the merge uses |
| 58 | + // its own resolver), but also in the inner recursive invocations of |
| 59 | + // partitioning. |
| 60 | + if (offsets_.size() <= 1) { |
| 61 | + return {0, index}; |
| 62 | + } |
| 63 | + const auto cached_chunk = cached_chunk_.load(); |
| 64 | + const bool cache_hit = |
| 65 | + (index >= offsets_[cached_chunk] && index < offsets_[cached_chunk + 1]); |
| 66 | + if (ARROW_PREDICT_TRUE(cache_hit)) { |
| 67 | + return {cached_chunk, index - offsets_[cached_chunk]}; |
| 68 | + } |
| 69 | + auto chunk_index = Bisect(index); |
| 70 | + cached_chunk_.store(chunk_index); |
| 71 | + return {chunk_index, index - offsets_[chunk_index]}; |
| 72 | + } |
| 73 | + |
| 74 | + protected: |
| 75 | + // Find the chunk index corresponding to a value index using binary search |
| 76 | + inline int64_t Bisect(const int64_t index) const { |
| 77 | + // Like std::upper_bound(), but hand-written as it can help the compiler. |
| 78 | + // Search [lo, lo + n) |
| 79 | + int64_t lo = 0; |
| 80 | + auto n = static_cast<int64_t>(offsets_.size()); |
| 81 | + while (n > 1) { |
| 82 | + const int64_t m = n >> 1; |
| 83 | + const int64_t mid = lo + m; |
| 84 | + if (static_cast<int64_t>(index) >= offsets_[mid]) { |
| 85 | + lo = mid; |
| 86 | + n -= m; |
| 87 | + } else { |
| 88 | + n = m; |
| 89 | + } |
| 90 | + } |
| 91 | + return lo; |
| 92 | + } |
| 93 | + |
| 94 | + private: |
| 95 | + // Collection of starting offsets used for binary search |
| 96 | + std::vector<int64_t> offsets_; |
| 97 | + |
| 98 | + // Tracks the most recently used chunk index to allow fast |
| 99 | + // access for consecutive indices corresponding to the same chunk |
| 100 | + mutable std::atomic<int64_t> cached_chunk_; |
| 101 | +}; |
| 102 | + |
| 103 | +} // namespace internal |
| 104 | +} // namespace arrow |
0 commit comments