Skip to content

Commit d9a4b39

Browse files
alamb and mbrobbel authored
Add variant_experimental feature to parquet crate (apache#8133)
# Which issue does this PR close? - Closes apache#8132 - Part of apache#8084 - Follow on to apache#8104 # Rationale for this change TLDR is we need a way to test and work out how Variant integration with the actual parquet reader/writer will look, so let's do it in the parquet crate. Please see the essay on apache#8132 for background Follow on tasks (I will file tickets for these items if we agree on this as an integration mechanism): - [x] Do not `panic` when writing VariantArray with the ArrowWriter: apache#8296 - [ ] Add some way to write the logical annotation to parquet metadata - [ ] Read arrays annotated with VARIANT logical type as VariantArrays in ArrowReader - [x] Update the variant_integration test to use `VariantArray` : apache#8084 - [x] Rename `variant_experimental` flag to `variant` and remove warnings about being experimental: apache#8297 Follow up tasks that came out of this PR but do not depend on it - [x] apache#8145 - [x] apache#8144 # What changes are included in this PR? 1. Add the `variant_experimental` feature to the `parquet` crate 2. Publicly export the variant crates 3. Add docs and examples # Are these changes tested? Yes by new CI # Are there any user-facing changes? This adds a new feature flag, and new --------- Co-authored-by: Matthijs Brobbel <[email protected]>
1 parent fb7d02e commit d9a4b39

File tree

7 files changed

+139
-3
lines changed

7 files changed

+139
-3
lines changed

.github/workflows/parquet.yml

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -119,7 +119,9 @@ jobs:
119119
run: cargo check -p parquet --no-default-features --features flate2 --features flate2-rust_backened
120120
- name: Check compilation --no-default-features --features flate2 --features flate2-zlib-rs
121121
run: cargo check -p parquet --no-default-features --features flate2 --features flate2-zlib-rs
122-
122+
- name: Check compilation --no-default-features --features variant_experimental
123+
run: cargo check -p parquet --no-default-features --features variant_experimental
124+
123125

124126
# test the parquet crate builds against wasm32 in stable rust
125127
wasm32-build:

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -104,7 +104,7 @@ parquet = { version = "56.1.0", path = "./parquet", default-features = false }
104104
# These crates have not yet been released and thus do not use the workspace version
105105
parquet-variant = { version = "0.1.0", path = "./parquet-variant" }
106106
parquet-variant-json = { version = "0.1.0", path = "./parquet-variant-json" }
107-
parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-json" }
107+
parquet-variant-compute = { version = "0.1.0", path = "./parquet-variant-compute" }
108108

109109
chrono = { version = "0.4.40", default-features = false, features = ["clock"] }
110110

parquet/Cargo.toml

Lines changed: 7 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -45,6 +45,10 @@ arrow-data = { workspace = true, optional = true }
4545
arrow-schema = { workspace = true, optional = true }
4646
arrow-select = { workspace = true, optional = true }
4747
arrow-ipc = { workspace = true, optional = true }
48+
parquet-variant = { workspace = true, optional = true }
49+
parquet-variant-json = { workspace = true, optional = true }
50+
parquet-variant-compute = { workspace = true, optional = true }
51+
4852
object_store = { version = "0.12.0", default-features = false, optional = true }
4953

5054
bytes = { version = "1.1", default-features = false, features = ["std"] }
@@ -108,7 +112,7 @@ json = ["serde_json", "base64"]
108112
# Enable internal testing APIs
109113
test_common = ["arrow/test_utils"]
110114
# Experimental, unstable functionality primarily used for testing
111-
experimental = []
115+
experimental = ["variant_experimental"]
112116
# Enable async APIs
113117
async = ["futures", "tokio"]
114118
# Enable object_store integration
@@ -124,6 +128,8 @@ encryption = ["dep:ring"]
124128
# Explicitely enabling rust_backend and zlib-rs features for flate2
125129
flate2-rust_backened = ["flate2/rust_backend"]
126130
flate2-zlib-rs = ["flate2/zlib-rs"]
131+
# Enable parquet variant support
132+
variant_experimental = ["parquet-variant", "parquet-variant-json", "parquet-variant-compute"]
127133

128134

129135
[[example]]

parquet/README.md

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -64,9 +64,11 @@ The `parquet` crate provides the following features which may be enabled in your
6464
- `experimental` - Experimental APIs which may change, even between minor releases
6565
- `simdutf8` (default) - Use the [`simdutf8`] crate for SIMD-accelerated UTF-8 validation
6666
- `encryption` - support for reading / writing encrypted Parquet files
67+
- `variant_experimental` - ⚠️ Experimental [Parquet Variant] support, which may change, even between minor releases.
6768

6869
[`arrow`]: https://crates.io/crates/arrow
6970
[`simdutf8`]: https://crates.io/crates/simdutf8
71+
[parquet variant]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
7072

7173
## Parquet Feature Status
7274

parquet/src/lib.rs

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -86,6 +86,14 @@
8686
//! [`ParquetRecordBatchStreamBuilder`]: arrow::async_reader::ParquetRecordBatchStreamBuilder
8787
//! [`ParquetObjectReader`]: arrow::async_reader::ParquetObjectReader
8888
//!
89+
//! ## Variant Logical Type (`variant_experimental` feature)
90+
//!
91+
//! The [`variant`] module supports reading and writing Parquet files
92+
//! with the [Variant Binary Encoding] logical type, which can represent
93+
//! semi-structured data such as JSON efficiently.
94+
//!
95+
//! [Variant Binary Encoding]: https://github.com/apache/parquet-format/blob/master/VariantEncoding.md
96+
//!
8997
//! ## Read/Write Parquet Directly
9098
//!
9199
//! Workloads needing finer-grained control, or to avoid a dependence on arrow,
@@ -179,3 +187,6 @@ pub mod record;
179187
pub mod schema;
180188

181189
pub mod thrift;
190+
191+
#[cfg(feature = "variant_experimental")]
192+
pub mod variant;

parquet/src/variant.rs

Lines changed: 115 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,115 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! ⚠️ Experimental Support for reading and writing [`Variant`]s to / from Parquet files ⚠️
19+
//!
20+
//! This is a 🚧 Work In Progress
21+
//!
22+
//! Note: Requires the `variant_experimental` feature of the `parquet` crate to be enabled.
23+
//!
24+
//! # Features
25+
//! * [`Variant`] represents variant value, which can be an object, list, or primitive.
26+
//! * [`VariantBuilder`] for building `Variant` values.
27+
//! * [`VariantArray`] for representing a column of Variant values.
28+
//! * [`compute`] module with functions for manipulating Variants, such as
29+
//! [`variant_get`] to extracting a value by path and functions to convert
30+
//! between `Variant` and JSON.
31+
//!
32+
//! [Variant Logical Type]: Variant
33+
//! [`VariantArray`]: compute::VariantArray
34+
//! [`variant_get`]: compute::variant_get
35+
//!
36+
//! # Example: Writing a Parquet file with Variant column
37+
//! ```rust
38+
//! # use parquet::variant::compute::{VariantArray, VariantArrayBuilder};
39+
//! # use parquet::variant::VariantBuilderExt;
40+
//! # use std::sync::Arc;
41+
//! # use arrow_array::{ArrayRef, RecordBatch};
42+
//! # use parquet::arrow::ArrowWriter;
43+
//! # fn main() -> Result<(), parquet::errors::ParquetError> {
44+
//! // Use the VariantArrayBuilder to build a VariantArray
45+
//! let mut builder = VariantArrayBuilder::new(3);
46+
//! // row 1: {"name": "Alice"}
47+
//! let mut variant_builder = builder.variant_builder();
48+
//! variant_builder.new_object().with_field("name", "Alice").finish();
49+
//! variant_builder.finish();
50+
//! let array = builder.build();
51+
//!
52+
//! // TODO support writing VariantArray directly
53+
//! // at the moment it panics when trying to downcast to a struct array
54+
//! // https://github.com/apache/arrow-rs/issues/8296
55+
//! // let array: ArrayRef = Arc::new(array);
56+
//! let array: ArrayRef = Arc::new(array.into_inner());
57+
//!
58+
//! // create a RecordBatch with the VariantArray
59+
//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
60+
//!
61+
//! // write the RecordBatch to a Parquet file
62+
//! let file = std::fs::File::create("variant.parquet")?;
63+
//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
64+
//! writer.write(&batch)?;
65+
//! writer.close()?;
66+
//!
67+
//! # std::fs::remove_file("variant.parquet")?;
68+
//! # Ok(())
69+
//! # }
70+
//! ```
71+
//!
72+
//! # Example: Writing JSON with a Parquet file with Variant column
73+
//! ```rust
74+
//! # use std::sync::Arc;
75+
//! # use arrow_array::{ArrayRef, RecordBatch, StringArray};
76+
//! # use parquet::variant::compute::json_to_variant;
77+
//! # use parquet::variant::compute::VariantArray;
78+
//! # use parquet::arrow::ArrowWriter;
79+
//! # fn main() -> Result<(), parquet::errors::ParquetError> {
80+
//! // Create an array of JSON strings, simulating a column of JSON data
81+
//! // TODO use StringViewArray when available
82+
//! let input_array = StringArray::from(vec![
83+
//! Some(r#"{"name": "Alice", "age": 30}"#),
84+
//! Some(r#"{"name": "Bob", "age": 25, "address": {"city": "New York"}}"#),
85+
//! None,
86+
//! Some("{}"),
87+
//! ]);
88+
//! let input_array: ArrayRef = Arc::new(input_array);
89+
//!
90+
//! // Convert the JSON strings to a VariantArray
91+
//! let array: VariantArray = json_to_variant(&input_array)?;
92+
//!
93+
//! // TODO support writing VariantArray directly
94+
//! // at the moment it panics when trying to downcast to a struct array
95+
//! // https://github.com/apache/arrow-rs/issues/8296
96+
//! // let array: ArrayRef = Arc::new(array);
97+
//! let array: ArrayRef = Arc::new(array.into_inner());
98+
//!
99+
//! // create a RecordBatch with the VariantArray
100+
//! let batch = RecordBatch::try_from_iter(vec![("data", array)])?;
101+
//!
102+
//! // write the RecordBatch to a Parquet file
103+
//! let file = std::fs::File::create("variant-json.parquet")?;
104+
//! let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?;
105+
//! writer.write(&batch)?;
106+
//! writer.close()?;
107+
//! # std::fs::remove_file("variant-json.parquet")?;
108+
//! # Ok(())
109+
//! # }
110+
//! ```
111+
//!
112+
//! # Example: Reading a Parquet file with Variant column
113+
//! (TODO: add example)
114+
pub use parquet_variant::*;
115+
pub use parquet_variant_compute as compute;
File renamed without changes.

0 commit comments

Comments (0)