Skip to content

[Variant] Fix broken metadata builder rollback #8135

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 87 additions & 12 deletions parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ impl ParentState<'_> {
}
}

// Return the offset of the underlying buffer at the time of calling this method.
// Return the current offset of the underlying buffer. Used as a savepoint for rollback.
fn buffer_current_offset(&self) -> usize {
match self {
ParentState::Variant { buffer, .. }
Expand All @@ -678,8 +678,9 @@ impl ParentState<'_> {
}
}

// Return the current index of the undelying metadata buffer at the time of calling this method.
fn metadata_current_offset(&self) -> usize {
// Return the current dictionary size of the undelying metadata builder. Used as a savepoint for
// rollback.
fn metadata_num_fields(&self) -> usize {
match self {
ParentState::Variant {
metadata_builder, ..
Expand All @@ -689,7 +690,7 @@ impl ParentState<'_> {
}
| ParentState::List {
metadata_builder, ..
} => metadata_builder.metadata_buffer.len(),
} => metadata_builder.field_names.len(),
}
}
}
Expand Down Expand Up @@ -1021,6 +1022,28 @@ impl VariantBuilder {
self
}

/// Builder-style API for appending a value to the list and returning self to enable method chaining.
///
/// # Panics
///
/// This method will panic if the variant contains duplicate field names in objects
/// when validation is enabled. For a fallible version, use [`ListBuilder::try_with_value`].
pub fn with_value<'m, 'd, T: Into<Variant<'m, 'd>>>(mut self, value: T) -> Self {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

self.append_value(value);
self
}

/// Builder-style API for appending a value to the list and returns self for method chaining.
///
/// This is the fallible version of [`ListBuilder::with_value`].
pub fn try_with_value<'m, 'd, T: Into<Variant<'m, 'd>>>(
mut self,
value: T,
) -> Result<Self, ArrowError> {
self.try_append_value(value)?;
Ok(self)
}

/// This method reserves capacity for field names in the Variant metadata,
/// which can improve performance when you know the approximate number of unique field
/// names that will be used across all objects in the [`Variant`].
Expand Down Expand Up @@ -1130,7 +1153,7 @@ pub struct ListBuilder<'a> {
impl<'a> ListBuilder<'a> {
fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self {
let parent_value_offset_base = parent_state.buffer_current_offset();
let parent_metadata_offset_base = parent_state.metadata_current_offset();
let parent_metadata_offset_base = parent_state.metadata_num_fields();
Self {
parent_state,
offsets: vec![],
Expand Down Expand Up @@ -1312,7 +1335,7 @@ pub struct ObjectBuilder<'a> {
impl<'a> ObjectBuilder<'a> {
fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self {
let offset_base = parent_state.buffer_current_offset();
let meta_offset_base = parent_state.metadata_current_offset();
let meta_offset_base = parent_state.metadata_num_fields();
Self {
parent_state,
fields: IndexMap::new(),
Expand Down Expand Up @@ -2928,13 +2951,16 @@ mod tests {
// The parent object should only contain the original fields
object_builder.finish().unwrap();
let (metadata, value) = builder.finish();

let metadata = VariantMetadata::try_new(&metadata).unwrap();
assert_eq!(metadata.len(), 1);
assert_eq!(&metadata[0], "second");
assert_eq!(metadata.len(), 2);
assert_eq!(&metadata[0], "first");
assert_eq!(&metadata[1], "second");

let variant = Variant::try_new_with_metadata(metadata, &value).unwrap();
let obj = variant.as_object().unwrap();
assert_eq!(obj.len(), 1);
assert_eq!(obj.len(), 2);
assert_eq!(obj.get("first"), Some(Variant::Int8(1)));
assert_eq!(obj.get("second"), Some(Variant::Int8(2)));
}

Expand Down Expand Up @@ -2979,13 +3005,16 @@ mod tests {
// The parent object should only contain the original fields
object_builder.finish().unwrap();
let (metadata, value) = builder.finish();

let metadata = VariantMetadata::try_new(&metadata).unwrap();
assert_eq!(metadata.len(), 1); // the fields of nested_object_builder has been rolled back
assert_eq!(&metadata[0], "second");
assert_eq!(metadata.len(), 2); // the fields of nested_object_builder has been rolled back
assert_eq!(&metadata[0], "first");
assert_eq!(&metadata[1], "second");

let variant = Variant::try_new_with_metadata(metadata, &value).unwrap();
let obj = variant.as_object().unwrap();
assert_eq!(obj.len(), 1);
assert_eq!(obj.len(), 2);
assert_eq!(obj.get("first"), Some(Variant::Int8(1)));
assert_eq!(obj.get("second"), Some(Variant::Int8(2)));
}

Expand Down Expand Up @@ -3121,4 +3150,50 @@ mod tests {

builder.finish()
}

// Make sure that we can correctly build deeply nested objects even when some of the nested
// builders don't finish.
#[test]
fn test_append_list_object_list_object() {
// An infinite counter
let mut counter = 0..;
let mut take = move |i| (&mut counter).take(i).collect::<Vec<_>>();
let mut builder = VariantBuilder::new();
let skip = 5;
{
let mut list = builder.new_list();
for i in take(4) {
let mut object = list.new_object();
for i in take(4) {
let field_name = format!("field{i}");
let mut list = object.new_list(&field_name);
for i in take(3) {
let mut object = list.new_object();
for i in take(3) {
if i % skip != 0 {
object.insert(&format!("field{i}"), i);
}
}
if i % skip != 0 {
object.finish().unwrap();
}
}
if i % skip != 0 {
list.finish();
}
}
if i % skip != 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test!

Do we need to make the finish conditions for the inner and outer objects different? so that the field names added by the inner and outer objects are different.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each loop has a different i (they shadow each other). No field name is used twice, and skipped field names are not used at all.

object.finish().unwrap();
}
}
list.finish();
}
let (metadata, value) = builder.finish();
let v1 = Variant::try_new(&metadata, &value).unwrap();

let (metadata, value) = VariantBuilder::new().with_value(v1.clone()).finish();
let v2 = Variant::try_new(&metadata, &value).unwrap();

assert_eq!(format!("{v1:?}"), format!("{v2:?}"));
}
}
Loading