Skip to content

Commit 55587ef

Browse files
authored
GH-47859: [C++] Fix creating union types without type_codes for fields.size() == 128 (#47815)
### Rationale for this change There is a bug for creating union types with empty `type_codes`. If `fields.size()` == 128 (`kMaxTypeCode` + 1) and `type_codes` is empty, static_cast<int8_t> returns -128 and `internal::Iota` generates an empty vector of type codes, but the expected vector is [0, 1, 2, ..., 127], where 127 is `kMaxTypeCode`. ### What changes are included in this PR? - Added a new `internal::Iota` function to generate vectors of size = `length` with values from `start`. - Changed `internal::Iota` call from old parameters to new for creating `dense_union` and `sparse_union` types. - Implemented a new test to detect this error. ### Are these changes tested? Yes, there is a new test. ### Are there any user-facing changes? No. **This PR contains a "Critical Fix".** (b) a bug that caused incorrect or invalid data to be produced If `fields.size()` == 128 (`kMaxTypeCode` + 1), `internal::Iota` returns an empty vector that cannot validate here: https://github.com/apache/arrow/blob/main/cpp/src/arrow/type.cc#L1232 * GitHub Issue: #47859 Lead-authored-by: Daniil Timizhev <[email protected]> Co-authored-by: Daniil Timižev <[email protected]> Signed-off-by: Sutou Kouhei <[email protected]>
1 parent 4cedc9d commit 55587ef

File tree

4 files changed

+45
-8
lines changed

4 files changed

+45
-8
lines changed

cpp/src/arrow/type.cc

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3270,15 +3270,15 @@ std::shared_ptr<DataType> run_end_encoded(std::shared_ptr<DataType> run_end_type
32703270
std::shared_ptr<DataType> sparse_union(FieldVector child_fields,
32713271
std::vector<int8_t> type_codes) {
32723272
if (type_codes.empty()) {
3273-
type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
3273+
type_codes = internal::Iota<int8_t>(0, child_fields.size());
32743274
}
32753275
return std::make_shared<SparseUnionType>(std::move(child_fields),
32763276
std::move(type_codes));
32773277
}
32783278
std::shared_ptr<DataType> dense_union(FieldVector child_fields,
32793279
std::vector<int8_t> type_codes) {
32803280
if (type_codes.empty()) {
3281-
type_codes = internal::Iota(static_cast<int8_t>(child_fields.size()));
3281+
type_codes = internal::Iota<int8_t>(0, child_fields.size());
32823282
}
32833283
return std::make_shared<DenseUnionType>(std::move(child_fields), std::move(type_codes));
32843284
}
@@ -3310,7 +3310,7 @@ std::shared_ptr<DataType> sparse_union(const ArrayVector& children,
33103310
std::vector<std::string> field_names,
33113311
std::vector<int8_t> type_codes) {
33123312
if (type_codes.empty()) {
3313-
type_codes = internal::Iota(static_cast<int8_t>(children.size()));
3313+
type_codes = internal::Iota<int8_t>(0, children.size());
33143314
}
33153315
auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
33163316
return sparse_union(std::move(fields), std::move(type_codes));
@@ -3320,7 +3320,7 @@ std::shared_ptr<DataType> dense_union(const ArrayVector& children,
33203320
std::vector<std::string> field_names,
33213321
std::vector<int8_t> type_codes) {
33223322
if (type_codes.empty()) {
3323-
type_codes = internal::Iota(static_cast<int8_t>(children.size()));
3323+
type_codes = internal::Iota<int8_t>(0, children.size());
33243324
}
33253325
auto fields = FieldsFromArraysAndNames(std::move(field_names), children);
33263326
return dense_union(std::move(fields), std::move(type_codes));

cpp/src/arrow/type_test.cc

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <cstdint>
2323
#include <functional>
2424
#include <memory>
25+
#include <numeric>
2526
#include <string>
2627
#include <unordered_set>
2728
#include <vector>
@@ -2191,6 +2192,36 @@ TEST(TestUnionType, Basics) {
21912192
ASSERT_EQ(ty6->child_ids(), child_ids2);
21922193
}
21932194

2195+
TEST(TestUnionType, MaxTypeCode) {
2196+
std::vector<std::shared_ptr<Field>> fields;
2197+
for (int32_t i = 0; i <= UnionType::kMaxTypeCode; i++) {
2198+
fields.push_back(field(std::to_string(i), int32()));
2199+
}
2200+
2201+
std::vector<int8_t> type_codes(fields.size());
2202+
std::iota(type_codes.begin(), type_codes.end(), 0);
2203+
2204+
auto t1 = checked_pointer_cast<UnionType>(dense_union(fields, type_codes));
2205+
ASSERT_EQ(t1->type_codes().size(), UnionType::kMaxTypeCode + 1);
2206+
ASSERT_EQ(t1->child_ids().size(), UnionType::kMaxTypeCode + 1);
2207+
2208+
auto t2 = checked_pointer_cast<UnionType>(dense_union(fields));
2209+
ASSERT_EQ(t2->type_codes().size(), UnionType::kMaxTypeCode + 1);
2210+
ASSERT_EQ(t2->child_ids().size(), UnionType::kMaxTypeCode + 1);
2211+
2212+
AssertTypeEqual(*t1, *t2);
2213+
2214+
auto t3 = checked_pointer_cast<UnionType>(sparse_union(fields, type_codes));
2215+
ASSERT_EQ(t3->type_codes().size(), UnionType::kMaxTypeCode + 1);
2216+
ASSERT_EQ(t3->child_ids().size(), UnionType::kMaxTypeCode + 1);
2217+
2218+
auto t4 = checked_pointer_cast<UnionType>(sparse_union(fields));
2219+
ASSERT_EQ(t4->type_codes().size(), UnionType::kMaxTypeCode + 1);
2220+
ASSERT_EQ(t4->child_ids().size(), UnionType::kMaxTypeCode + 1);
2221+
2222+
AssertTypeEqual(*t3, *t4);
2223+
}
2224+
21942225
TEST(TestDictionaryType, Basics) {
21952226
auto value_type = int32();
21962227

cpp/src/arrow/util/range.h

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -27,15 +27,21 @@
2727

2828
namespace arrow::internal {
2929

30+
/// Create a vector containing the values from start with length elements
31+
template <typename T>
32+
std::vector<T> Iota(T start, size_t length) {
33+
std::vector<T> result(length);
34+
std::iota(result.begin(), result.end(), start);
35+
return result;
36+
}
37+
3038
/// Create a vector containing the values from start up to stop
3139
template <typename T>
3240
std::vector<T> Iota(T start, T stop) {
3341
if (start > stop) {
3442
return {};
3543
}
36-
std::vector<T> result(static_cast<size_t>(stop - start));
37-
std::iota(result.begin(), result.end(), start);
38-
return result;
44+
return Iota<T>(start, static_cast<size_t>(stop - start));
3945
}
4046

4147
/// Create a vector containing the values from 0 up to length

docs/source/format/Columnar.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -880,7 +880,7 @@ each value. Its physical layout is as follows:
880880
* One child array for each type
881881
* Types buffer: A buffer of 8-bit signed integers. Each type in the
882882
union has a corresponding type id whose values are found in this
883-
buffer. A union with more than 127 possible types can be modeled as
883+
buffer. A union with more than 128 possible types can be modeled as
884884
a union of unions.
885885
* Offsets buffer: A buffer of signed Int32 values indicating the
886886
relative offset into the respective child array for the type in a

0 commit comments

Comments
 (0)