diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs index e715d0a6c05a..c54125894222 100644 --- a/parquet-variant-compute/src/variant_array.rs +++ b/parquet-variant-compute/src/variant_array.rs @@ -187,6 +187,13 @@ impl VariantArray { typed_value_to_variant(typed_value, index) } } + ShreddingState::AllNull { .. } => { + // NOTE: This handles the case where neither value nor typed_value fields exist. + // For top-level variants, this returns Variant::Null (JSON null). + // For shredded object fields, this technically should indicate SQL NULL, + // but the current API cannot distinguish these contexts. + Variant::Null + } } } @@ -226,9 +233,6 @@ impl VariantArray { /// [Parquet Variant Shredding Spec]: https://github.com/apache/parquet-format/blob/master/VariantShredding.md#value-shredding #[derive(Debug)] pub enum ShreddingState { - // TODO: add missing state where there is neither value nor typed_value - // https://github.com/apache/arrow-rs/issues/8088 - // Missing { metadata: BinaryViewArray }, /// This variant has no typed_value field Unshredded { metadata: BinaryViewArray, @@ -251,6 +255,13 @@ pub enum ShreddingState { value: BinaryViewArray, typed_value: ArrayRef, }, + /// All values are null, only metadata is present. + /// + /// This state occurs when neither `value` nor `typed_value` fields exist in the schema. + /// Note: By strict spec interpretation, this should only be valid for shredded object fields, + /// not top-level variants. However, we allow it and treat as Variant::Null for pragmatic + /// handling of missing data. + AllNull { metadata: BinaryViewArray }, } impl ShreddingState { @@ -271,9 +282,7 @@ impl ShreddingState { metadata, typed_value, }), - (_metadata_field, None, None) => Err(ArrowError::InvalidArgumentError(String::from( - "VariantArray has neither value nor typed_value field", - ))), + (metadata, None, None) => Ok(Self::AllNull { metadata }), } } @@ -283,6 +292,7 @@ impl ShreddingState { ShreddingState::Unshredded { metadata, .. } => metadata, ShreddingState::Typed { metadata, .. } => metadata, ShreddingState::PartiallyShredded { metadata, .. } => metadata, + ShreddingState::AllNull { metadata } => metadata, } } @@ -292,6 +302,7 @@ impl ShreddingState { ShreddingState::Unshredded { value, .. } => Some(value), ShreddingState::Typed { .. } => None, ShreddingState::PartiallyShredded { value, .. } => Some(value), + ShreddingState::AllNull { .. } => None, } } @@ -301,6 +312,7 @@ impl ShreddingState { ShreddingState::Unshredded { .. } => None, ShreddingState::Typed { typed_value, .. } => Some(typed_value), ShreddingState::PartiallyShredded { typed_value, .. } => Some(typed_value), + ShreddingState::AllNull { .. } => None, } } @@ -327,6 +339,9 @@ impl ShreddingState { value: value.slice(offset, length), typed_value: typed_value.slice(offset, length), }, + ShreddingState::AllNull { metadata } => ShreddingState::AllNull { + metadata: metadata.slice(offset, length), + }, } } } @@ -435,15 +450,27 @@ mod test { } #[test] - fn invalid_missing_value() { + fn all_null_missing_value_and_typed_value() { let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); let array = StructArray::new(fields, vec![make_binary_view_array()], None); - // Should fail because the StructArray does not contain a 'value' field - let err = VariantArray::try_new(Arc::new(array)); - assert_eq!( - err.unwrap_err().to_string(), - "Invalid argument error: VariantArray has neither value nor typed_value field" - ); + + // NOTE: By strict spec interpretation, this case (top-level variant with null/null) + // should be invalid, but we currently allow it and treat it as Variant::Null. + // This is a pragmatic decision to handle missing data gracefully. + let variant_array = VariantArray::try_new(Arc::new(array)).unwrap(); + + // Verify the shredding state is AllNull + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::AllNull { .. } + )); + + // Verify that value() returns Variant::Null (compensating for spec violation) + for i in 0..variant_array.len() { + if variant_array.is_valid(i) { + assert_eq!(variant_array.value(i), parquet_variant::Variant::Null); + } + } } #[test] @@ -489,4 +516,85 @@ mod test { fn make_binary_array() -> ArrayRef { Arc::new(BinaryArray::from(vec![b"test" as &[u8]])) } + + #[test] + fn all_null_shredding_state() { + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]]); + let shredding_state = ShreddingState::try_new(metadata.clone(), None, None).unwrap(); + + assert!(matches!(shredding_state, ShreddingState::AllNull { .. })); + + // Verify metadata is preserved correctly + if let ShreddingState::AllNull { metadata: m } = shredding_state { + assert_eq!(m.len(), metadata.len()); + assert_eq!(m.value(0), metadata.value(0)); + } + } + + #[test] + fn all_null_variant_array_construction() { + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]); + let nulls = NullBuffer::from(vec![false, false, false]); // all null + + let fields = Fields::from(vec![Field::new("metadata", DataType::BinaryView, false)]); + let struct_array = StructArray::new(fields, vec![Arc::new(metadata)], Some(nulls)); + + let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + + // Verify the shredding state is AllNull + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::AllNull { .. } + )); + + // Verify all values are null + assert_eq!(variant_array.len(), 3); + assert!(!variant_array.is_valid(0)); + assert!(!variant_array.is_valid(1)); + assert!(!variant_array.is_valid(2)); + + // Verify that value() returns Variant::Null for all indices + for i in 0..variant_array.len() { + assert!( + !variant_array.is_valid(i), + "Expected value at index {i} to be null" + ); + } + } + + #[test] + fn value_field_present_but_all_null_should_be_unshredded() { + // This test demonstrates the issue: when a value field exists in schema + // but all its values are null, it should remain Unshredded, not AllNull + let metadata = BinaryViewArray::from(vec![b"test" as &[u8]; 3]); + + // Create a value field with all null values + let value_nulls = NullBuffer::from(vec![false, false, false]); // all null + let value_array = BinaryViewArray::from_iter_values(vec![""; 3]); + let value_data = value_array + .to_data() + .into_builder() + .nulls(Some(value_nulls)) + .build() + .unwrap(); + let value = BinaryViewArray::from(value_data); + + let fields = Fields::from(vec![ + Field::new("metadata", DataType::BinaryView, false), + Field::new("value", DataType::BinaryView, true), // Field exists in schema + ]); + let struct_array = StructArray::new( + fields, + vec![Arc::new(metadata), Arc::new(value)], + None, // struct itself is not null, just the value field is all null + ); + + let variant_array = VariantArray::try_new(Arc::new(struct_array)).unwrap(); + + // This should be Unshredded, not AllNull, because value field exists in schema + assert!(matches!( + variant_array.shredding_state(), + ShreddingState::Unshredded { .. } + )); + } } diff --git a/parquet-variant-compute/src/variant_get/mod.rs b/parquet-variant-compute/src/variant_get/mod.rs index cc852bbc32a2..694000279551 100644 --- a/parquet-variant-compute/src/variant_get/mod.rs +++ b/parquet-variant-compute/src/variant_get/mod.rs @@ -58,6 +58,7 @@ pub fn variant_get(input: &ArrayRef, options: GetOptions) -> Result { ShreddingState::Unshredded { metadata, value } => { output_builder.unshredded(variant_array, metadata, value) } + ShreddingState::AllNull { metadata } => output_builder.all_null(variant_array, metadata), } } @@ -284,6 +285,40 @@ mod test { assert_eq!(&result, &expected) } + /// AllNull: extract a value as a VariantArray + #[test] + fn get_variant_all_null_as_variant() { + let array = all_null_variant_array(); + let options = GetOptions::new(); + let result = variant_get(&array, options).unwrap(); + + // expect the result is a VariantArray + let result: &VariantArray = result.as_any().downcast_ref().unwrap(); + assert_eq!(result.len(), 3); + + // All values should be null + assert!(!result.is_valid(0)); + assert!(!result.is_valid(1)); + assert!(!result.is_valid(2)); + } + + /// AllNull: extract a value as an Int32Array + #[test] + fn get_variant_all_null_as_int32() { + let array = all_null_variant_array(); + // specify we want the typed value as Int32 + let field = Field::new("typed_value", DataType::Int32, true); + let options = GetOptions::new().with_as_type(Some(FieldRef::from(field))); + let result = variant_get(&array, options).unwrap(); + + let expected: ArrayRef = Arc::new(Int32Array::from(vec![ + Option::::None, + Option::::None, + Option::::None, + ])); + assert_eq!(&result, &expected) + } + /// Return a VariantArray that represents a perfectly "shredded" variant /// for the following example (3 Variant::Int32 values): /// @@ -427,4 +462,42 @@ mod test { StructArray::new(Fields::from(fields), arrays, nulls) } } + + /// Return a VariantArray that represents an "all null" variant + /// for the following example (3 null values): + /// + /// ```text + /// null + /// null + /// null + /// ``` + /// + /// The schema of the corresponding `StructArray` would look like this: + /// + /// ```text + /// StructArray { + /// metadata: BinaryViewArray, + /// } + /// ``` + fn all_null_variant_array() -> ArrayRef { + let (metadata, _value) = { parquet_variant::VariantBuilder::new().finish() }; + + let nulls = NullBuffer::from(vec![ + false, // row 0 is null + false, // row 1 is null + false, // row 2 is null + ]); + + // metadata is the same for all rows (though they're all null) + let metadata = BinaryViewArray::from_iter_values(std::iter::repeat_n(&metadata, 3)); + + let struct_array = StructArrayBuilder::new() + .with_field("metadata", Arc::new(metadata)) + .with_nulls(nulls) + .build(); + + Arc::new( + VariantArray::try_new(Arc::new(struct_array)).expect("should create variant array"), + ) + } } diff --git a/parquet-variant-compute/src/variant_get/output/mod.rs b/parquet-variant-compute/src/variant_get/output/mod.rs index 245d73cce8db..52a8f5bc0288 100644 --- a/parquet-variant-compute/src/variant_get/output/mod.rs +++ b/parquet-variant-compute/src/variant_get/output/mod.rs @@ -58,6 +58,13 @@ pub(crate) trait OutputBuilder { metadata: &BinaryViewArray, value_field: &BinaryViewArray, ) -> Result; + + /// write out an all-null variant array + fn all_null( + &self, + variant_array: &VariantArray, + metadata: &BinaryViewArray, + ) -> Result; } pub(crate) fn instantiate_output_builder<'a>( diff --git a/parquet-variant-compute/src/variant_get/output/primitive.rs b/parquet-variant-compute/src/variant_get/output/primitive.rs index 496d711c1044..aabc9827a7b7 100644 --- a/parquet-variant-compute/src/variant_get/output/primitive.rs +++ b/parquet-variant-compute/src/variant_get/output/primitive.rs @@ -20,8 +20,8 @@ use crate::VariantArray; use arrow::error::Result; use arrow::array::{ - Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, NullBufferBuilder, - PrimitiveArray, + new_null_array, Array, ArrayRef, ArrowPrimitiveType, AsArray, BinaryViewArray, + NullBufferBuilder, PrimitiveArray, }; use arrow::compute::{cast_with_options, CastOptions}; use arrow::datatypes::Int32Type; @@ -157,6 +157,18 @@ impl OutputBuilder for PrimitiveOutputBuilder<'_, T> { "variant_get unshredded to primitive types is not implemented yet", ))) } + + fn all_null( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + ) -> Result { + // For all-null case, create a primitive array with all null values + Ok(Arc::new(new_null_array( + self.as_type.data_type(), + variant_array.len(), + ))) + } } impl ArrowPrimitiveVariant for Int32Type { diff --git a/parquet-variant-compute/src/variant_get/output/variant.rs b/parquet-variant-compute/src/variant_get/output/variant.rs index 6f2f829b662d..7c8b4da2f5c1 100644 --- a/parquet-variant-compute/src/variant_get/output/variant.rs +++ b/parquet-variant-compute/src/variant_get/output/variant.rs @@ -145,4 +145,17 @@ impl OutputBuilder for VariantOutputBuilder<'_> { Ok(Arc::new(builder.build())) } + + fn all_null( + &self, + variant_array: &VariantArray, + _metadata: &BinaryViewArray, + ) -> arrow::error::Result { + // For all-null case, simply create a VariantArray with all null values + let mut builder = VariantArrayBuilder::new(variant_array.len()); + for _i in 0..variant_array.len() { + builder.append_null(); + } + Ok(Arc::new(builder.build())) + } }