Skip to content

[Variant] Implement ShreddingState::AllNull variant #8093

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
112 changes: 100 additions & 12 deletions parquet-variant-compute/src/variant_array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ impl VariantArray {
typed_value_to_variant(typed_value, index)
}
}
ShreddingState::AllNull { .. } => Variant::Null,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is tricky... see #8122 (comment)

  • For a top-level variant, null/null is illegal (though returning Variant::Null is arguably a correct way to compensate)
  • For a shredded variant field, null/null means SQL NULL, and returning Variant::Null is arguably incorrect (causes SQL IS NULL operator to return FALSE). But we don't even have a way to return SQL NULL here (it would probably correspond to Option::None?)

}
}

Expand Down Expand Up @@ -226,8 +227,6 @@ impl VariantArray {
/// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding
#[derive(Debug)]
pub enum ShreddingState {
// TODO: add missing state where there is neither value nor typed_value
// Missing { metadata: BinaryViewArray },
/// This variant has no typed_value field
Unshredded {
metadata: BinaryViewArray,
Expand All @@ -250,6 +249,8 @@ pub enum ShreddingState {
value: BinaryViewArray,
typed_value: ArrayRef,
},
/// All values are null, only metadata is present
AllNull { metadata: BinaryViewArray },
}

impl ShreddingState {
Expand All @@ -270,9 +271,7 @@ impl ShreddingState {
metadata,
typed_value,
}),
(_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from(
"VariantArray has neither value nor typed_value field",
))),
(metadata, None, None) => Ok(Self::AllNull { metadata }),
}
}

Expand All @@ -282,6 +281,7 @@ impl ShreddingState {
ShreddingState::Unshredded { metadata, .. } => metadata,
ShreddingState::Typed { metadata, .. } => metadata,
ShreddingState::PartiallyShredded { metadata, .. } => metadata,
ShreddingState::AllNull { metadata } => metadata,
}
}

Expand All @@ -291,6 +291,7 @@ impl ShreddingState {
ShreddingState::Unshredded { value, .. } => Some(value),
ShreddingState::Typed { .. } => None,
ShreddingState::PartiallyShredded { value, .. } => Some(value),
ShreddingState::AllNull { .. } => None,
}
}

Expand All @@ -300,6 +301,7 @@ impl ShreddingState {
ShreddingState::Unshredded { .. } => None,
ShreddingState::Typed { typed_value, .. } => Some(typed_value),
ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value),
ShreddingState::AllNull { .. } => None,
}
}

Expand All @@ -326,6 +328,9 @@ impl ShreddingState {
value: value.slice(offset, length),
typed_value: typed_value.slice(offset, length),
},
ShreddingState::AllNull { metadata } => ShreddingState::AllNull {
metadata: metadata.slice(offset, length),
},
}
}
}
Expand Down Expand Up @@ -434,15 +439,17 @@ mod test {
}

#[test]
fn invalid_missing_value() {
fn all_null_missing_value_and_typed_value() {
let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
let array = StructArray::new(fields, vec![make_binary_view_array()], None);
// Should fail because the StructArray does not contain a 'value' field
let err = VariantArray::try_new(Arc::new(array));
assert_eq!(
err.unwrap_err().to_string(),
"Invalid argument error: VariantArray has neither value nor typed_value field"
);
// Should succeed and create an AllNull variant when neither value nor typed_value are present
let variant_array = VariantArray::try_new(Arc::new(array)).unwrap();
Comment on lines +445 to +446
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

By a strict reading of the spec, this should actually fail, because this ShreddingState does not represent a shredded object field, but rather represents a top-level variant value:

value typed_value Meaning
null null The value is missing; only valid for shredded object fields

But maybe that's a validation VariantArray::try_new should perform, not ShreddingState::try_new?

Also, it would quickly become annoying if variant_get has to replace a missing or all-null value column with an all-Variant::Null column just to comply with the spec? Maybe that's why there's this additional tidbit?

If a Variant is missing in a context where a value is required, readers must return a Variant null (00): basic type 0 (primitive) and physical type 0 (null). For example, if a Variant is required (like measurement above) and both value and typed_value are null, the returned value must be 00 (Variant null).


// Verify the shredding state is AllNull
assert!(matches!(
variant_array.shredding_state(),
ShreddingState::AllNull { .. }
));
}

#[test]
Expand Down Expand Up @@ -488,4 +495,85 @@ mod test {
/// Test helper: a one-row `BinaryArray` containing the bytes `b"test"`, returned as an `ArrayRef`.
fn make_binary_array() -> ArrayRef {
    let array = BinaryArray::from(vec![b"test" as &[u8]]);
    Arc::new(array)
}

/// `ShreddingState::try_new` with neither `value` nor `typed_value` yields `AllNull`
/// and keeps the metadata column it was given.
#[test]
fn all_null_shredding_state() {
    let expected = BinaryViewArray::from(vec![b"test" as &[u8]]);
    let shredding_state = ShreddingState::try_new(expected.clone(), None, None).unwrap();

    assert!(matches!(shredding_state, ShreddingState::AllNull { .. }));

    // The metadata stored in the state must match what was passed in
    if let ShreddingState::AllNull { metadata: actual } = shredding_state {
        assert_eq!(actual.len(), expected.len());
        assert_eq!(actual.value(0), expected.value(0));
    }
}

#[test]
fn all_null_variant_array_construction() {
let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);
let nulls = NullBuffer::from(vec![false, false, false]); // all null
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are we sure that this case (where there is a value field present, but it is all null) should be treated as though there was no value field?

I haven't double checked the spec (probably the Arrow one) but this feels like it may be out of compliance

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As far as I know, the spec treats "both value and typed_value are null" as meaning "the value is missing."
We can see from the spec's example:
refer to here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, I got it. I will add a test for the case.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added a comprehensive test that demonstrates when a value field exists in the schema but contains all null values, it correctly remains in the Unshredded state rather than AllNull.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From what I understand, the treatment of NULL/NULL depends on context:

  • For a top-level variant value, it's interpreted as Variant::Null
  • For a shredded variant object field, it's interpreted as missing (SQL NULL)

So I guess there are two ways to get SQL NULL -- null mask on the struct(value, typed_value), or if both value and typed_value are themselves NULL?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah -- the spec requires that the struct ("group") containing value and typed_value columns must be non-nullable:

The group for each named field must use repetition level required. A field's value and typed_value are set to null (missing) to indicate that the field does not exist in the variant. To encode a field that is present with a [variant/JSON, not SQL] null value, the value must contain a Variant null: basic type 0 (primitive) and physical type 0 (null).

So effectively, the NULL/NULL combo becomes the null mask for that nested field. Which is why a top-level NULL/NULL combo is incorrect -- the top-level field already has its own nullability.


let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]);
let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls));

let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap();

// Verify the shredding state is AllNull
assert!(matches!(
variant_array.shredding_state(),
ShreddingState::AllNull { .. }
));

// Verify all values are null
assert_eq!(variant_array.len(), 3);
assert!(!variant_array.is_valid(0));
assert!(!variant_array.is_valid(1));
assert!(!variant_array.is_valid(2));

// Re-check validity index by index so a failure reports which row was unexpectedly valid
for i in 0..variant_array.len() {
assert!(
!variant_array.is_valid(i),
"Expected value at index {i} to be null"
);
}
}

/// A `value` column that exists in the schema but holds only nulls must still be
/// classified as `Unshredded`; `AllNull` is reserved for the case where the column is absent.
#[test]
fn value_field_present_but_all_null_should_be_unshredded() {
    let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]);

    // Start from three empty strings, then attach a validity mask that nulls out every row
    let every_row_null = NullBuffer::from(vec![false, false, false]);
    let value_data = BinaryViewArray::from_iter_values(vec![""; 3])
        .to_data()
        .into_builder()
        .nulls(Some(every_row_null))
        .build()
        .unwrap();
    let value = BinaryViewArray::from(value_data);

    // The schema declares both columns; `value` is nullable
    let fields = Fields::from(vec![
        Field::new("metadata", DataType::BinaryView, false),
        Field::new("value", DataType::BinaryView, true),
    ]);

    // No struct-level null mask: only the `value` child is null
    let struct_array = StructArray::new(fields, vec![Arc::new(metadata), Arc::new(value)], None);

    let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap();

    // Presence of the `value` field in the schema decides the state, not its contents
    assert!(matches!(
        variant_array.shredding_state(),
        ShreddingState::Unshredded { .. }
    ));
}
}
73 changes: 73 additions & 0 deletions parquet-variant-compute/src/variant_get/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result<ArrayRef> {
ShreddingState::Unshredded { metadata, value } => {
output_builder.unshredded(variant_array, metadata, value)
}
ShreddingState::AllNull { metadata } => output_builder.all_null(variant_array, metadata),
}
}

Expand Down Expand Up @@ -284,6 +285,40 @@ mod test {
assert_eq!(&result, &expected)
}

/// AllNull: with no target type requested, the output is a `VariantArray` of nulls
#[test]
fn get_variant_all_null_as_variant() {
    let input = all_null_variant_array();
    let output = variant_get(&input, GetOptions::new()).unwrap();

    // The output must downcast to a VariantArray with one row per input row
    let variants: &VariantArray = output.as_any().downcast_ref().unwrap();
    assert_eq!(variants.len(), 3);

    // Every row must be reported as null
    for row in 0..3 {
        assert!(!variants.is_valid(row));
    }
}

/// AllNull: requesting `Int32` output yields an `Int32Array` of nulls
#[test]
fn get_variant_all_null_as_int32() {
    let input = all_null_variant_array();

    // Ask for the result to be materialised as a nullable Int32 column
    let as_type = FieldRef::from(Field::new("typed_value", DataType::Int32, true));
    let options = GetOptions::new().with_as_type(Some(as_type));
    let result = variant_get(&input, options).unwrap();

    let expected: ArrayRef = Arc::new(Int32Array::from(vec![None::<i32>; 3]));
    assert_eq!(&result, &expected)
}

/// Return a VariantArray that represents a perfectly "shredded" variant
/// for the following example (3 Variant::Int32 values):
///
Expand Down Expand Up @@ -427,4 +462,42 @@ mod test {
StructArray::new(Fields::from(fields), arrays, nulls)
}
}

/// Build a three-row `VariantArray` in the "all null" shape, i.e. one whose
/// underlying struct has a `metadata` column and nothing else:
///
/// ```text
/// null
/// null
/// null
/// ```
///
/// Layout of the backing `StructArray`:
///
/// ```text
/// StructArray {
///   metadata: BinaryViewArray,
/// }
/// ```
fn all_null_variant_array() -> ArrayRef {
    // Only the metadata bytes from the builder are used; the value bytes are discarded
    let (metadata_bytes, _value) = parquet_variant::VariantBuilder::new().finish();

    // Struct-level validity: rows 0, 1 and 2 are all null
    let nulls = NullBuffer::from(vec![false; 3]);

    // Each row carries the same metadata, even though every row is null
    let metadata_column =
        BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata_bytes, 3));

    let struct_array = StructArrayBuilder::new()
        .with_field("metadata", Arc::new(metadata_column))
        .with_nulls(nulls)
        .build();

    let variant_array =
        VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array");
    Arc::new(variant_array)
}
}
7 changes: 7 additions & 0 deletions parquet-variant-compute/src/variant_get/output/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder {
metadata: &BinaryViewArray,
value_field: &BinaryViewArray,
) -> Result<ArrayRef>;

/// Write out an all-null variant array.
///
/// `variant_get` dispatches here when the input's shredding state is
/// `ShreddingState::AllNull`, i.e. the input carries a `metadata` column but
/// neither a `value` nor a `typed_value` column.
///
/// * `variant_array` - the input array being read
/// * `metadata` - the input's metadata column
///
/// Returns the output array produced by this builder, or an error.
fn all_null(
&self,
variant_array: &VariantArray,
metadata: &BinaryViewArray,
) -> Result<ArrayRef>;
}

pub(crate) fn instantiate_output_builder<'a>(
Expand Down
16 changes: 14 additions & 2 deletions parquet-variant-compute/src/variant_get/output/primitive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ use crate::VariantArray;
use arrow::error::Result;

use arrow::array::{
Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder,
PrimitiveArray,
new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray,
NullBufferBuilder, PrimitiveArray,
};
use arrow::compute::{cast_with_options, CastOptions};
use arrow::datatypes::Int32Type;
Expand Down Expand Up @@ -155,6 +155,18 @@ impl<'a, T: ArrowPrimitiveVariant> OutputBuilder for PrimitiveOutputBuilder<'a,
"variant_get unshredded to primitive types is not implemented yet",
)))
}

/// Produce the output for an input in the all-null shredding state: an array of
/// the requested `as_type` data type with one null entry per input row.
///
/// The metadata column is not consulted, because there are no values to decode.
fn all_null(
    &self,
    variant_array: &VariantArray,
    _metadata: &BinaryViewArray,
) -> Result<ArrayRef> {
    // `new_null_array` already returns an `ArrayRef`. Wrapping it in a second
    // `Arc` would still type-check (an `ArrayRef` is itself usable as an array),
    // but it costs an extra allocation and an extra pointer hop on every access.
    Ok(new_null_array(
        self.as_type.data_type(),
        variant_array.len(),
    ))
}
}

impl ArrowPrimitiveVariant for Int32Type {
Expand Down
13 changes: 13 additions & 0 deletions parquet-variant-compute/src/variant_get/output/variant.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,4 +143,17 @@ impl<'a> OutputBuilder for VariantOutputBuilder<'a> {

Ok(Arc::new(builder.build()))
}

/// Produce the output for an input in the all-null shredding state: a
/// `VariantArray` with one null entry per input row. The metadata column is unused.
fn all_null(
    &self,
    variant_array: &VariantArray,
    _metadata: &BinaryViewArray,
) -> arrow::error::Result<ArrayRef> {
    let row_count = variant_array.len();

    // Append one null per input row so the output length matches the input
    let mut builder = VariantArrayBuilder::new(row_count);
    for _ in 0..row_count {
        builder.append_null();
    }

    Ok(Arc::new(builder.build()))
}
}
Loading