Skip to content
This repository was archived by the owner on Jan 16, 2024. It is now read-only.

Commit 4f8322f

Browse files
committed
Bundle 8.0.0-1 (2022-05-08)
1 parent b0ade85 commit 4f8322f

File tree

119 files changed

+6522
-578
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

119 files changed

+6522
-578
lines changed

README.md

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,25 +1,32 @@
1-
# Apache Arrow 7.0.0
2-
3-
Combined bundle with builds for rtools40 [mingw-w64-arrow](https://github.com/r-windows/rtools-packages/blob/master/mingw-w64-arrow/PKGBUILD) and [backports](https://github.com/r-windows/rtools-backports/blob/master/mingw-w64-arrow/PKGBUILD) for the R legacy toolchain in [lib-4.9.3](lib-4.9.3)
4-
5-
Now supports parquet (thrift) and snappy. Example flags to compile and link the R bindings:
6-
7-
```
8-
PKG_CPPFLAGS = -I$(ARROW_INCLUDE) \
9-
-DARROW_R_WITH_ARROW -DARROW_DS_STATIC -DARROW_STATIC -DPARQUET_STATIC
10-
11-
CXX_STD = CXX11
12-
13-
PKG_LIBS = \
14-
-L$(ARROW_LIBS) \
15-
-lparquet -larrow_dataset -larrow \
16-
-lthrift -lsnappy -lz -lzstd -llz4 -lcrypto -lcrypt32
17-
```
18-
19-
To test this make sure you install the arrow package from a release tag:
20-
21-
```r
22-
remotes::install_github("apache/arrow/r@apache-arrow-7.0.0")
23-
```
24-
25-
To install R package from the arrow master branch you also would need to rebuild the master branch arrow C++ library from source.
1+
# arrow 8.0.0-1
2+
3+
- mingw-w64-i686-arrow-8.0.0-1-any.pkg.tar.xz
4+
- mingw-w64-i686-aws-sdk-cpp-1.7.365-1-any.pkg.tar.xz
5+
- mingw-w64-i686-brotli-1.0.9-4-any.pkg.tar.xz
6+
- mingw-w64-i686-openssl-1.1.1.m-9800-any.pkg.tar.xz
7+
- mingw-w64-i686-lz4-1.8.2-1-any.pkg.tar.xz
8+
- mingw-w64-i686-re2-20200801-1-any.pkg.tar.xz
9+
- mingw-w64-i686-snappy-1.1.7-2-any.pkg.tar.xz
10+
- mingw-w64-i686-thrift-0.13.0-1-any.pkg.tar.xz
11+
- mingw-w64-i686-zstd-1.4.4-1-any.pkg.tar.xz
12+
- mingw-w64-i686-libutf8proc-2.4.0-2-any.pkg.tar.xz
13+
- mingw-w64-x86_64-arrow-8.0.0-1-any.pkg.tar.xz
14+
- mingw-w64-x86_64-aws-sdk-cpp-1.7.365-1-any.pkg.tar.xz
15+
- mingw-w64-x86_64-brotli-1.0.9-4-any.pkg.tar.xz
16+
- mingw-w64-x86_64-openssl-1.1.1.m-9800-any.pkg.tar.xz
17+
- mingw-w64-x86_64-lz4-1.8.2-1-any.pkg.tar.xz
18+
- mingw-w64-x86_64-re2-20200801-1-any.pkg.tar.xz
19+
- mingw-w64-x86_64-snappy-1.1.7-2-any.pkg.tar.xz
20+
- mingw-w64-x86_64-thrift-0.13.0-1-any.pkg.tar.xz
21+
- mingw-w64-x86_64-zstd-1.4.4-1-any.pkg.tar.xz
22+
- mingw-w64-x86_64-libutf8proc-2.4.0-2-any.pkg.tar.xz
23+
- mingw-w64-ucrt-x86_64-arrow-8.0.0-1-any.pkg.tar.xz
24+
- mingw-w64-ucrt-x86_64-aws-sdk-cpp-1.7.365-1-any.pkg.tar.xz
25+
- mingw-w64-ucrt-x86_64-brotli-1.0.9-4-any.pkg.tar.xz
26+
- mingw-w64-ucrt-x86_64-openssl-1.1.1.m-9800-any.pkg.tar.xz
27+
- mingw-w64-ucrt-x86_64-lz4-1.8.2-1-any.pkg.tar.xz
28+
- mingw-w64-ucrt-x86_64-re2-20200801-1-any.pkg.tar.xz
29+
- mingw-w64-ucrt-x86_64-snappy-1.1.7-2-any.pkg.tar.xz
30+
- mingw-w64-ucrt-x86_64-thrift-0.13.0-1-any.pkg.tar.xz
31+
- mingw-w64-ucrt-x86_64-zstd-1.4.4-1-any.pkg.tar.xz
32+
- mingw-w64-ucrt-x86_64-libutf8proc-2.4.0-2-any.pkg.tar.xz

include/arrow/api.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,8 @@
3838
#include "arrow/tensor.h" // IWYU pragma: export
3939
#include "arrow/type.h" // IWYU pragma: export
4040
#include "arrow/util/key_value_metadata.h" // IWYU pragma: export
41+
#include "arrow/visit_array_inline.h" // IWYU pragma: export
42+
#include "arrow/visit_scalar_inline.h" // IWYU pragma: export
4143
#include "arrow/visitor.h" // IWYU pragma: export
4244

4345
/// \brief Top-level namespace for Apache Arrow C++ API

include/arrow/array/array_base.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ class ARROW_EXPORT Array {
133133
int64_t end_idx, int64_t other_start_idx,
134134
const EqualOptions& = EqualOptions::Defaults()) const;
135135

136+
/// \brief Apply the ArrayVisitor::Visit() method specialized to the array type
136137
Status Accept(ArrayVisitor* visitor) const;
137138

138139
/// Construct a zero-copy view of this array with the given type.
@@ -187,10 +188,11 @@ class ARROW_EXPORT Array {
187188
Status ValidateFull() const;
188189

189190
protected:
190-
Array() : null_bitmap_data_(NULLPTR) {}
191+
Array() = default;
192+
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
191193

192194
std::shared_ptr<ArrayData> data_;
193-
const uint8_t* null_bitmap_data_;
195+
const uint8_t* null_bitmap_data_ = NULLPTR;
194196

195197
/// Protected method for constructors
196198
void SetData(const std::shared_ptr<ArrayData>& data) {
@@ -204,6 +206,8 @@ class ARROW_EXPORT Array {
204206

205207
private:
206208
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);
209+
210+
ARROW_EXPORT friend void PrintTo(const Array& x, std::ostream* os);
207211
};
208212

209213
static inline std::ostream& operator<<(std::ostream& os, const Array& x) {

include/arrow/array/array_nested.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,10 @@ class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
129129
const Array& offsets, const Array& values,
130130
MemoryPool* pool = default_memory_pool());
131131

132+
static Result<std::shared_ptr<ListArray>> FromArrays(
133+
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
134+
MemoryPool* pool = default_memory_pool());
135+
132136
/// \brief Return an Array that is a concatenation of the lists in this array.
133137
///
134138
/// Note that it's different from `values()` in that it takes into
@@ -138,6 +142,10 @@ class ARROW_EXPORT ListArray : public BaseListArray<ListType> {
138142
MemoryPool* memory_pool = default_memory_pool()) const;
139143

140144
/// \brief Return list offsets as an Int32Array
145+
///
146+
/// The returned array will not have a validity bitmap, so you cannot expect
147+
/// to pass it to ListArray::FromArrays() and get back the same list array
148+
/// if the original one has nulls.
141149
std::shared_ptr<Array> offsets() const;
142150

143151
protected:
@@ -174,6 +182,10 @@ class ARROW_EXPORT LargeListArray : public BaseListArray<LargeListType> {
174182
const Array& offsets, const Array& values,
175183
MemoryPool* pool = default_memory_pool());
176184

185+
static Result<std::shared_ptr<LargeListArray>> FromArrays(
186+
std::shared_ptr<DataType> type, const Array& offsets, const Array& values,
187+
MemoryPool* pool = default_memory_pool());
188+
177189
/// \brief Return an Array that is a concatenation of the lists in this array.
178190
///
179191
/// Note that it's different from `values()` in that it takes into
@@ -311,6 +323,14 @@ class ARROW_EXPORT FixedSizeListArray : public Array {
311323
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
312324
int32_t list_size);
313325

326+
/// \brief Construct FixedSizeListArray from child value array and type
327+
///
328+
/// \param[in] values Array containing list values
329+
/// \param[in] type The fixed sized list type
330+
/// \return Will have length equal to values.length() / type.list_size()
331+
static Result<std::shared_ptr<Array>> FromArrays(const std::shared_ptr<Array>& values,
332+
std::shared_ptr<DataType> type);
333+
314334
protected:
315335
void SetData(const std::shared_ptr<ArrayData>& data);
316336
int32_t list_size_;

include/arrow/array/builder_adaptive.h

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -48,9 +48,11 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
4848
/// \param[in] length the number of nulls to append
4949
Status AppendNulls(int64_t length) final {
5050
ARROW_RETURN_NOT_OK(CommitPendingData());
51-
ARROW_RETURN_NOT_OK(Reserve(length));
52-
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
53-
UnsafeSetNull(length);
51+
if (ARROW_PREDICT_TRUE(length > 0)) {
52+
ARROW_RETURN_NOT_OK(Reserve(length));
53+
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
54+
UnsafeSetNull(length);
55+
}
5456
return Status::OK();
5557
}
5658

@@ -70,9 +72,11 @@ class ARROW_EXPORT AdaptiveIntBuilderBase : public ArrayBuilder {
7072

7173
Status AppendEmptyValues(int64_t length) final {
7274
ARROW_RETURN_NOT_OK(CommitPendingData());
73-
ARROW_RETURN_NOT_OK(Reserve(length));
74-
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
75-
UnsafeSetNotNull(length);
75+
if (ARROW_PREDICT_TRUE(length > 0)) {
76+
ARROW_RETURN_NOT_OK(Reserve(length));
77+
memset(data_->mutable_data() + length_ * int_size_, 0, int_size_ * length);
78+
UnsafeSetNotNull(length);
79+
}
7680
return Status::OK();
7781
}
7882

include/arrow/array/builder_base.h

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
#include "arrow/array/array_primitive.h"
2929
#include "arrow/buffer.h"
3030
#include "arrow/buffer_builder.h"
31+
#include "arrow/result.h"
3132
#include "arrow/status.h"
3233
#include "arrow/type_fwd.h"
3334
#include "arrow/util/macros.h"
@@ -306,13 +307,27 @@ ARROW_EXPORT
306307
Status MakeBuilder(MemoryPool* pool, const std::shared_ptr<DataType>& type,
307308
std::unique_ptr<ArrayBuilder>* out);
308309

310+
/// \brief Result-returning convenience overload of MakeBuilder()
///
/// Forwards to the Status-returning MakeBuilder(pool, type, &out) and hands the
/// builder back by value instead of through an out-parameter.
/// \param[in] type the data type passed through to MakeBuilder()
/// \param[in] pool the MemoryPool passed through; defaults to default_memory_pool()
/// \return the builder produced by the forwarded call; a non-OK Status is
/// presumably propagated by ARROW_RETURN_NOT_OK (macro defined elsewhere)
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilder(
311+
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
312+
std::unique_ptr<ArrayBuilder> out;
313+
ARROW_RETURN_NOT_OK(MakeBuilder(pool, type, &out));
314+
// Explicit move: `out` is a move-only unique_ptr being converted into the
// Result<...> return value.
return std::move(out);
315+
}
316+
309317
/// \brief Construct an empty ArrayBuilder corresponding to the data
310318
/// type, where any top-level or nested dictionary builders return the
311319
/// exact index type specified by the type.
312320
ARROW_EXPORT
313321
Status MakeBuilderExactIndex(MemoryPool* pool, const std::shared_ptr<DataType>& type,
314322
std::unique_ptr<ArrayBuilder>* out);
315323

324+
/// \brief Result-returning convenience overload of MakeBuilderExactIndex()
///
/// Forwards to the Status-returning MakeBuilderExactIndex(pool, type, &out) and
/// hands the builder back by value instead of through an out-parameter.
/// \param[in] type the data type passed through to MakeBuilderExactIndex()
/// \param[in] pool the MemoryPool passed through; defaults to default_memory_pool()
/// \return the builder produced by the forwarded call; a non-OK Status is
/// presumably propagated by ARROW_RETURN_NOT_OK (macro defined elsewhere)
inline Result<std::unique_ptr<ArrayBuilder>> MakeBuilderExactIndex(
325+
const std::shared_ptr<DataType>& type, MemoryPool* pool = default_memory_pool()) {
326+
std::unique_ptr<ArrayBuilder> out;
327+
ARROW_RETURN_NOT_OK(MakeBuilderExactIndex(pool, type, &out));
328+
// Explicit move: `out` is a move-only unique_ptr being converted into the
// Result<...> return value.
return std::move(out);
329+
}
330+
316331
/// \brief Construct an empty DictionaryBuilder initialized optionally
317332
/// with a pre-existing dictionary
318333
/// \param[in] pool the MemoryPool to use for allocations
@@ -324,4 +339,12 @@ Status MakeDictionaryBuilder(MemoryPool* pool, const std::shared_ptr<DataType>&
324339
const std::shared_ptr<Array>& dictionary,
325340
std::unique_ptr<ArrayBuilder>* out);
326341

342+
/// \brief Result-returning convenience overload of MakeDictionaryBuilder()
///
/// Forwards to the Status-returning
/// MakeDictionaryBuilder(pool, type, dictionary, &out) and hands the builder
/// back by value instead of through an out-parameter.
/// \param[in] type the data type passed through to MakeDictionaryBuilder()
/// \param[in] dictionary the dictionary array passed through unchanged
/// \param[in] pool the MemoryPool passed through; defaults to default_memory_pool()
/// \return the builder produced by the forwarded call; a non-OK Status is
/// presumably propagated by ARROW_RETURN_NOT_OK (macro defined elsewhere)
inline Result<std::unique_ptr<ArrayBuilder>> MakeDictionaryBuilder(
343+
const std::shared_ptr<DataType>& type, const std::shared_ptr<Array>& dictionary,
344+
MemoryPool* pool = default_memory_pool()) {
345+
std::unique_ptr<ArrayBuilder> out;
346+
ARROW_RETURN_NOT_OK(MakeDictionaryBuilder(pool, type, dictionary, &out));
347+
// Explicit move: `out` is a move-only unique_ptr being converted into the
// Result<...> return value.
return std::move(out);
348+
}
349+
327350
} // namespace arrow

include/arrow/buffer.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -255,6 +255,13 @@ class ARROW_EXPORT Buffer {
255255
static Result<std::shared_ptr<Buffer>> Copy(std::shared_ptr<Buffer> source,
256256
const std::shared_ptr<MemoryManager>& to);
257257

258+
/// \brief Copy a non-owned buffer
259+
///
260+
/// This is useful for cases where the source memory area is externally managed
261+
/// (its lifetime not tied to the source Buffer), otherwise please use Copy().
262+
static Result<std::unique_ptr<Buffer>> CopyNonOwned(
263+
const Buffer& source, const std::shared_ptr<MemoryManager>& to);
264+
258265
/// \brief View buffer
259266
///
260267
/// Return a Buffer that reflects this buffer, seen potentially from another

include/arrow/chunk_resolver.h

Lines changed: 104 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,104 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
#pragma once
19+
20+
#include <atomic>
21+
#include <cstdint>
22+
#include <vector>
23+
24+
#include "arrow/type_fwd.h"
25+
#include "arrow/util/macros.h"
26+
27+
namespace arrow {
28+
namespace internal {
29+
30+
// Result of ChunkResolver::Resolve(): the index of a chunk together with the
// position of the requested element inside that chunk.
struct ChunkLocation {
31+
// chunk_index: which chunk; index_in_chunk: offset relative to that chunk's start
int64_t chunk_index, index_in_chunk;
32+
};
33+
34+
// An object that resolves an array chunk depending on a logical index
//
// It keeps a vector of starting offsets (offsets_) that is binary-searched,
// plus an atomic cache of the most recently resolved chunk index
// (cached_chunk_) that Resolve() checks before searching.
// NOTE(review): this struct calls std::move, but the standard includes visible
// in this header are only <atomic>, <cstdint> and <vector>; <utility>
// presumably arrives transitively -- confirm.
35+
struct ChunkResolver {
36+
// The three explicit constructors are only declared here; their definitions
// are not visible in this header. Presumably they fill offsets_ from the
// chunk/batch lengths and initialize cached_chunk_ -- TODO confirm.
explicit ChunkResolver(const ArrayVector& chunks);
37+
38+
explicit ChunkResolver(const std::vector<const Array*>& chunks);
39+
40+
explicit ChunkResolver(const RecordBatchVector& batches);
41+
42+
// Move constructor. std::atomic can be neither copied nor moved, so the
// cached index is transferred by value through load(); `other` keeps its
// cached value while its offsets_ vector is moved from.
// NOTE(review): neither move operation is declared noexcept.
ChunkResolver(ChunkResolver&& other)
43+
: offsets_(std::move(other.offsets_)), cached_chunk_(other.cached_chunk_.load()) {}
44+
45+
// Move assignment; same load()/store() hand-over of the cached index as the
// move constructor.
ChunkResolver& operator=(ChunkResolver&& other) {
46+
offsets_ = std::move(other.offsets_);
47+
cached_chunk_.store(other.cached_chunk_.load());
48+
return *this;
49+
}
50+
51+
/// \brief Return a ChunkLocation containing the chunk index and in-chunk value index of
52+
/// the chunked array at logical index
/// \param[in] index logical index to resolve; no bounds check is performed here
/// \return {chunk_index, index_in_chunk}; when offsets_ holds at most one
/// entry the result is {0, index}
53+
inline ChunkLocation Resolve(const int64_t index) const {
54+
// It is common for the algorithms below to make consecutive accesses at
55+
// a relatively small distance from each other, hence often falling in
56+
// the same chunk.
57+
// This is trivial when merging (assuming each side of the merge uses
58+
// its own resolver), but also in the inner recursive invocations of
59+
// partitioning.
60+
// With zero or one offsets there is nothing to search: report chunk 0 and
// pass the index through unchanged.
if (offsets_.size() <= 1) {
61+
return {0, index};
62+
}
63+
// Fast path: test whether index lies in
// [offsets_[cached_chunk], offsets_[cached_chunk + 1]).
const auto cached_chunk = cached_chunk_.load();
64+
const bool cache_hit =
65+
(index >= offsets_[cached_chunk] && index < offsets_[cached_chunk + 1]);
66+
if (ARROW_PREDICT_TRUE(cache_hit)) {
67+
return {cached_chunk, index - offsets_[cached_chunk]};
68+
}
69+
// Cache miss: binary-search, then remember the chunk for the next call.
// cached_chunk_ is declared mutable, which is what allows the store in this
// const method.
auto chunk_index = Bisect(index);
70+
cached_chunk_.store(chunk_index);
71+
return {chunk_index, index - offsets_[chunk_index]};
72+
}
73+
74+
protected:
75+
// Find the chunk index corresponding to a value index using binary search
//
// Returns the last position in offsets_ whose value is <= index, or 0 when no
// such position exists (including when offsets_ is empty).
// Assumes offsets_ is sorted in ascending order -- TODO confirm against the
// constructors.
76+
inline int64_t Bisect(const int64_t index) const {
77+
// Like std::upper_bound(), but hand-written as it can help the compiler.
78+
// Search [lo, lo + n)
79+
int64_t lo = 0;
80+
auto n = static_cast<int64_t>(offsets_.size());
81+
while (n > 1) {
82+
const int64_t m = n >> 1;
83+
const int64_t mid = lo + m;
84+
// index is already an int64_t parameter, so this cast does not change it.
if (static_cast<int64_t>(index) >= offsets_[mid]) {
85+
// Answer is at mid or later: keep the upper part [mid, lo + n).
lo = mid;
86+
n -= m;
87+
} else {
88+
// Answer is before mid: keep the lower part [lo, mid).
n = m;
89+
}
90+
}
91+
return lo;
92+
}
93+
94+
private:
95+
// Collection of starting offsets used for binary search
96+
std::vector<int64_t> offsets_;
97+
98+
// Tracks the most recently used chunk index to allow fast
99+
// access for consecutive indices corresponding to the same chunk
// No default member initializer is given here; the move constructor sets it
// explicitly, the other constructors are defined elsewhere.
100+
mutable std::atomic<int64_t> cached_chunk_;
101+
};
102+
103+
} // namespace internal
104+
} // namespace arrow

include/arrow/chunked_array.h

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include <utility>
2424
#include <vector>
2525

26+
#include "arrow/chunk_resolver.h"
2627
#include "arrow/compare.h"
2728
#include "arrow/result.h"
2829
#include "arrow/status.h"
@@ -177,11 +178,12 @@ class ARROW_EXPORT ChunkedArray {
177178

178179
protected:
179180
ArrayVector chunks_;
181+
std::shared_ptr<DataType> type_;
180182
int64_t length_;
181183
int64_t null_count_;
182-
std::shared_ptr<DataType> type_;
183184

184185
private:
186+
internal::ChunkResolver chunk_resolver_;
185187
ARROW_DISALLOW_COPY_AND_ASSIGN(ChunkedArray);
186188
};
187189

0 commit comments

Comments
 (0)