Skip to content
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion parquet-variant-compute/src/variant_get/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,7 @@ mod test {

let err = variant_get(&array, options).unwrap_err();
// TODO make this error message nicer (not Debug format)
assert_eq!(err.to_string(), "Cast error: Failed to extract primitive of type Int32 from variant ShortString(ShortString(\"n/a\")) at path VariantPath([])");
assert_eq!(err.to_string(), "Cast error: Failed to extract primitive of type Int32 from variant ShortString(\"n/a\") at path VariantPath([])");
}

/// Perfect Shredding: extract the typed value as a VariantArray
Expand Down
99 changes: 87 additions & 12 deletions parquet-variant/src/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -669,7 +669,7 @@ impl ParentState<'_> {
}
}

// Return the offset of the underlying buffer at the time of calling this method.
// Return the current offset of the underlying buffer. Used as a savepoint for rollback.
fn buffer_current_offset(&self) -> usize {
match self {
ParentState::Variant { buffer, .. }
Expand All @@ -678,8 +678,9 @@ impl ParentState<'_> {
}
}

// Return the current index of the undelying metadata buffer at the time of calling this method.
fn metadata_current_offset(&self) -> usize {
// Return the current dictionary size of the undelying metadata builder. Used as a savepoint for
// rollback.
fn metadata_num_fields(&self) -> usize {
match self {
ParentState::Variant {
metadata_builder, ..
Expand All @@ -689,7 +690,7 @@ impl ParentState<'_> {
}
| ParentState::List {
metadata_builder, ..
} => metadata_builder.metadata_buffer.len(),
} => metadata_builder.field_names.len(),
}
}
}
Expand Down Expand Up @@ -1021,6 +1022,28 @@ impl VariantBuilder {
self
}

/// Builder-style API for appending a value to the list and returning self to enable method chaining.
///
/// # Panics
///
/// This method will panic if the variant contains duplicate field names in objects
/// when validation is enabled. For a fallible version, use [`ListBuilder::try_with_value`].
pub fn with_value<'m, 'd, T: Into<Variant<'m, 'd>>>(mut self, value: T) -> Self {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice!

self.append_value(value);
self
}

/// Builder-style API for appending a value to the list and returns self for method chaining.
///
/// This is the fallible version of [`ListBuilder::with_value`].
pub fn try_with_value<'m, 'd, T: Into<Variant<'m, 'd>>>(
mut self,
value: T,
) -> Result<Self, ArrowError> {
self.try_append_value(value)?;
Ok(self)
}

/// This method reserves capacity for field names in the Variant metadata,
/// which can improve performance when you know the approximate number of unique field
/// names that will be used across all objects in the [`Variant`].
Expand Down Expand Up @@ -1130,7 +1153,7 @@ pub struct ListBuilder<'a> {
impl<'a> ListBuilder<'a> {
fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self {
let parent_value_offset_base = parent_state.buffer_current_offset();
let parent_metadata_offset_base = parent_state.metadata_current_offset();
let parent_metadata_offset_base = parent_state.metadata_num_fields();
Self {
parent_state,
offsets: vec![],
Expand Down Expand Up @@ -1312,7 +1335,7 @@ pub struct ObjectBuilder<'a> {
impl<'a> ObjectBuilder<'a> {
fn new(parent_state: ParentState<'a>, validate_unique_fields: bool) -> Self {
let offset_base = parent_state.buffer_current_offset();
let meta_offset_base = parent_state.metadata_current_offset();
let meta_offset_base = parent_state.metadata_num_fields();
Self {
parent_state,
fields: IndexMap::new(),
Expand Down Expand Up @@ -2928,13 +2951,16 @@ mod tests {
// The parent object should only contain the original fields
object_builder.finish().unwrap();
let (metadata, value) = builder.finish();

let metadata = VariantMetadata::try_new(&metadata).unwrap();
assert_eq!(metadata.len(), 1);
assert_eq!(&metadata[0], "second");
assert_eq!(metadata.len(), 2);
assert_eq!(&metadata[0], "first");
assert_eq!(&metadata[1], "second");

let variant = Variant::try_new_with_metadata(metadata, &value).unwrap();
let obj = variant.as_object().unwrap();
assert_eq!(obj.len(), 1);
assert_eq!(obj.len(), 2);
assert_eq!(obj.get("first"), Some(Variant::Int8(1)));
assert_eq!(obj.get("second"), Some(Variant::Int8(2)));
}

Expand Down Expand Up @@ -2979,13 +3005,16 @@ mod tests {
// The parent object should only contain the original fields
object_builder.finish().unwrap();
let (metadata, value) = builder.finish();

let metadata = VariantMetadata::try_new(&metadata).unwrap();
assert_eq!(metadata.len(), 1); // the fields of nested_object_builder has been rolled back
assert_eq!(&metadata[0], "second");
assert_eq!(metadata.len(), 2); // the fields of nested_object_builder has been rolled back
assert_eq!(&metadata[0], "first");
assert_eq!(&metadata[1], "second");

let variant = Variant::try_new_with_metadata(metadata, &value).unwrap();
let obj = variant.as_object().unwrap();
assert_eq!(obj.len(), 1);
assert_eq!(obj.len(), 2);
assert_eq!(obj.get("first"), Some(Variant::Int8(1)));
assert_eq!(obj.get("second"), Some(Variant::Int8(2)));
}

Expand Down Expand Up @@ -3121,4 +3150,50 @@ mod tests {

builder.finish()
}

// Make sure that we can correctly build deeply nested objects even when some of the nested
// builders don't finish.
#[test]
fn test_append_list_object_list_object() {
// An infinite counter
let mut counter = 0..;
let mut take = move |i| (&mut counter).take(i).collect::<Vec<_>>();
let mut builder = VariantBuilder::new();
let skip = 5;
{
let mut list = builder.new_list();
for i in take(4) {
let mut object = list.new_object();
for i in take(4) {
let field_name = format!("field{i}");
let mut list = object.new_list(&field_name);
for i in take(3) {
let mut object = list.new_object();
for i in take(3) {
if i % skip != 0 {
object.insert(&format!("field{i}"), i);
}
}
if i % skip != 0 {
object.finish().unwrap();
}
}
if i % skip != 0 {
list.finish();
}
}
if i % skip != 0 {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice test!

Do we need to make the finish conditions for the inner and outer objects different? so that the field names added by the inner and outer objects are different.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Each loop has a different i (they shadow each other). No field name is used twice, and skipped field names are not used at all.

object.finish().unwrap();
}
}
list.finish();
}
let (metadata, value) = builder.finish();
let v1 = Variant::try_new(&metadata, &value).unwrap();

let (metadata, value) = VariantBuilder::new().with_value(v1.clone()).finish();
let v2 = Variant::try_new(&metadata, &value).unwrap();

assert_eq!(format!("{v1:?}"), format!("{v2:?}"));
}
}
55 changes: 54 additions & 1 deletion parquet-variant/src/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ impl Deref for ShortString<'_> {
/// [metadata]: VariantMetadata#Validation
/// [object]: VariantObject#Validation
/// [array]: VariantList#Validation
#[derive(Debug, Clone, PartialEq)]
#[derive(Clone, PartialEq)]
pub enum Variant<'m, 'v> {
/// Primitive type: Null
Null,
Expand Down Expand Up @@ -1286,6 +1286,59 @@ impl TryFrom<(i128, u8)> for Variant<'_, '_> {
}
}

impl std::fmt::Debug for Variant<'_, '_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
match self {
Variant::Null => write!(f, "null"),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think for the debug printing we should also include the variant enum, otherwise one wouldn't be able to tell the difference between Variant::Int8(42) and Variant::Int16(42) from the debug output, as they would both be rendered as 42

So perhaps we can keep

Suggested change
Variant::Null => write!(f, "null"),
Variant::Null => write!(f, "Variant::Null"),

And then apply some nicer formatting to the binary / object ones

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I pulled out the impl Debug as its own PR that should merge first:

Let's pick up the conversation there as needed, this PR will shrink a lot after rebasing on top of it.

Variant::BooleanTrue => write!(f, "true"),
Variant::BooleanFalse => write!(f, "false"),
Variant::Int8(v) => write!(f, "{v}"),
Variant::Int16(v) => write!(f, "{v}"),
Variant::Int32(v) => write!(f, "{v}"),
Variant::Int64(v) => write!(f, "{v}"),
Variant::Float(v) => write!(f, "{v}"),
Variant::Double(v) => write!(f, "{v}"),
Variant::Decimal4(d) => write!(f, "{d}"),
Variant::Decimal8(d) => write!(f, "{d}"),
Variant::Decimal16(d) => write!(f, "{d}"),
Variant::Date(d) => write!(f, "\"{}\"", d.format("%Y-%m-%d")),
Variant::TimestampMicros(ts) => write!(f, "\"{}\"", ts.to_rfc3339()),
Variant::TimestampNtzMicros(ts) => {
write!(f, "\"{}\"", ts.format("%Y-%m-%dT%H:%M:%S%.6f"))
}
Variant::Binary(bytes) => {
write!(f, "\"0x")?;
for b in *bytes {
write!(f, "{:02x}", b)?;
}
write!(f, "\"")
}
Variant::String(s) => write!(f, "{s:?}"),
Variant::ShortString(s) => write!(f, "{s:?}"),
Variant::Object(obj) => {
let mut map = f.debug_map();
for res in obj.iter_try() {
match res {
Ok((k, v)) => map.entry(&k, &v),
Err(_) => map.entry(&"<invalid>", &"<invalid>"),
};
}
map.finish()
}
Variant::List(arr) => {
let mut list = f.debug_list();
for res in arr.iter_try() {
match res {
Ok(v) => list.entry(&v),
Err(_) => list.entry(&"<invalid>"),
};
}
list.finish()
}
}
}
}

#[cfg(test)]
mod tests {

Expand Down
Loading