-
Notifications
You must be signed in to change notification settings - Fork 990
Convert RunEndEncoded to Parquet #8069
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -28,7 +28,7 @@ use crate::util::bit_util::num_required_bits; | |
use crate::util::interner::{Interner, Storage}; | ||
use arrow_array::{ | ||
Array, ArrayAccessor, BinaryArray, BinaryViewArray, DictionaryArray, FixedSizeBinaryArray, | ||
LargeBinaryArray, LargeStringArray, StringArray, StringViewArray, | ||
LargeBinaryArray, LargeStringArray, RunArray, StringArray, StringViewArray, | ||
}; | ||
use arrow_schema::DataType; | ||
|
||
|
@@ -59,6 +59,28 @@ macro_rules! downcast_dict_op { | |
}; | ||
} | ||
|
||
// Downcasts `$array` to a `RunArray` keyed by `arrow_array::types::$key`,
// converts it to a typed run array over `$val` values, and calls `$op` with
// that typed array as the first argument followed by any extra `$arg`s.
//
// Panics if `$array` is not a `RunArray` with the requested run end type, or
// if its values cannot be downcast to `$val`. Callers are expected to have
// dispatched on the array's data type first, so either failure is a bug in the
// dispatch rather than a recoverable condition.
//
// NOTE(review): the PR author's inline remark on this macro reads "Mimicked"
// (truncated in this view) — presumably it mirrors the dictionary downcast
// helper defined above; confirm.
macro_rules! downcast_ree_impl {
    ($array:ident, $key:ident, $val:ident, $op:expr $(, $arg:expr)*) => {{
        $op($array
            .as_any()
            .downcast_ref::<RunArray<arrow_array::types::$key>>()
            .expect("array is not a RunArray with the dispatched run end type")
            .downcast::<$val>()
            .expect("run array values do not match the dispatched value type")$(, $arg)*)
    }};
}
|
||
// Dispatches `$op` over the run-end-encoded `$array` based on the data type of
// `$run_end_field`: `Int16`, `Int32` and `Int64` are forwarded to
// `downcast_ree_impl!` with the matching run end type, the value array type
// `$val`, and any extra `$arg`s.
//
// Any other run end type is treated as impossible and panics; the message
// names the offending type, matching the other `unreachable!` arms in this
// file.
macro_rules! downcast_ree_op {
    ($run_end_field:expr, $val:ident, $array:ident, $op:expr $(, $arg:expr)*) => {
        match $run_end_field.data_type() {
            DataType::Int16 => downcast_ree_impl!($array, Int16Type, $val, $op$(, $arg)*),
            DataType::Int32 => downcast_ree_impl!($array, Int32Type, $val, $op$(, $arg)*),
            DataType::Int64 => downcast_ree_impl!($array, Int64Type, $val, $op$(, $arg)*),
            d => unreachable!(
                "cannot downcast run end encoded array with {} run ends to byte array",
                d
            ),
        }
    };
}
|
||
macro_rules! downcast_op { | ||
($data_type:expr, $array:ident, $op:expr $(, $arg:expr)*) => { | ||
match $data_type { | ||
|
@@ -90,6 +112,20 @@ macro_rules! downcast_op { | |
} | ||
d => unreachable!("cannot downcast {} dictionary value to byte array", d), | ||
}, | ||
DataType::RunEndEncoded(run_end, value) => match value.data_type() { | ||
DataType::Utf8 => downcast_ree_op!(run_end, StringArray, $array, $op$(, $arg)*), | ||
DataType::LargeUtf8 => { | ||
downcast_ree_op!(run_end, LargeStringArray, $array, $op$(, $arg)*) | ||
} | ||
DataType::Binary => downcast_ree_op!(run_end, BinaryArray, $array, $op$(, $arg)*), | ||
DataType::LargeBinary => { | ||
downcast_ree_op!(run_end, LargeBinaryArray, $array, $op$(, $arg)*) | ||
} | ||
DataType::FixedSizeBinary(_) => { | ||
downcast_ree_op!(run_end, FixedSizeBinaryArray, $array, $op$(, $arg)*) | ||
} | ||
d => unreachable!("cannot downcast {} run end encoded value to byte array", d), | ||
}, | ||
d => unreachable!("cannot downcast {} to byte array", d), | ||
} | ||
}; | ||
|
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
|
@@ -222,6 +222,10 @@ impl LevelInfoBuilder { | |||||||||
_ => unreachable!(), | ||||||||||
}) | ||||||||||
} | ||||||||||
DataType::RunEndEncoded(_, v) if is_leaf(v.data_type()) => { | ||||||||||
let levels = ArrayLevels::new(parent_ctx, is_nullable, array.clone()); | ||||||||||
Ok(Self::Primitive(levels)) | ||||||||||
} | ||||||||||
Comment on lines
+225
to
+228
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is exactly the same as the Dictionary case above: arrow-rs/parquet/src/arrow/arrow_writer/levels.rs, lines 157 to 160 in 04f217b.
|
||||||||||
d => Err(nyi_err!("Datatype {} is not yet supported", d)), | ||||||||||
} | ||||||||||
} | ||||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1033,15 +1033,15 @@ impl ArrowColumnWriterFactory { | |
|
||
match data_type { | ||
_ if data_type.is_primitive() => out.push(col(leaves.next().unwrap())?), | ||
ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => out.push(col(leaves.next().unwrap())?), | ||
ArrowDataType::FixedSizeBinary(_) | ArrowDataType::Boolean | ArrowDataType::Null => { | ||
out.push(col(leaves.next().unwrap())?) | ||
} | ||
ArrowDataType::LargeBinary | ||
| ArrowDataType::Binary | ||
| ArrowDataType::Utf8 | ||
| ArrowDataType::LargeUtf8 | ||
| ArrowDataType::BinaryView | ||
| ArrowDataType::Utf8View => { | ||
out.push(bytes(leaves.next().unwrap())?) | ||
} | ||
| ArrowDataType::Utf8View => out.push(bytes(leaves.next().unwrap())?), | ||
ArrowDataType::List(f) | ||
| ArrowDataType::LargeList(f) | ||
| ArrowDataType::FixedSizeList(f, _) => { | ||
|
@@ -1058,21 +1058,29 @@ impl ArrowColumnWriterFactory { | |
self.get_arrow_column_writer(f[1].data_type(), props, leaves, out)? | ||
} | ||
_ => unreachable!("invalid map type"), | ||
} | ||
}, | ||
ArrowDataType::Dictionary(_, value_type) => match value_type.as_ref() { | ||
ArrowDataType::Utf8 | ArrowDataType::LargeUtf8 | ArrowDataType::Binary | ArrowDataType::LargeBinary => { | ||
out.push(bytes(leaves.next().unwrap())?) | ||
} | ||
ArrowDataType::Utf8 | ||
| ArrowDataType::LargeUtf8 | ||
| ArrowDataType::Binary | ||
| ArrowDataType::LargeBinary => out.push(bytes(leaves.next().unwrap())?), | ||
ArrowDataType::Utf8View | ArrowDataType::BinaryView => { | ||
out.push(bytes(leaves.next().unwrap())?) | ||
} | ||
ArrowDataType::FixedSizeBinary(_) => { | ||
ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?), | ||
_ => out.push(col(leaves.next().unwrap())?), | ||
}, | ||
ArrowDataType::RunEndEncoded(_run_ends, value_type) => match value_type.data_type() { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I've basically copied what the Dictionary branch does. Not sure if this is correct! |
||
ArrowDataType::Utf8 | ||
| ArrowDataType::LargeUtf8 | ||
| ArrowDataType::Binary | ||
| ArrowDataType::LargeBinary => out.push(bytes(leaves.next().unwrap())?), | ||
ArrowDataType::Utf8View | ArrowDataType::BinaryView => { | ||
out.push(bytes(leaves.next().unwrap())?) | ||
} | ||
_ => { | ||
out.push(col(leaves.next().unwrap())?) | ||
} | ||
} | ||
ArrowDataType::FixedSizeBinary(_) => out.push(bytes(leaves.next().unwrap())?), | ||
_ => out.push(col(leaves.next().unwrap())?), | ||
}, | ||
_ => return Err(ParquetError::NYI( | ||
format!( | ||
"Attempting to write an Arrow type {data_type:?} to parquet that is not yet implemented" | ||
|
@@ -1166,6 +1174,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi | |
write_primitive(typed, array.values(), levels) | ||
} | ||
}, | ||
ArrowDataType::RunEndEncoded(_run_ends, _value_type) => todo!(), | ||
_ => { | ||
let array = arrow_cast::cast(column, &ArrowDataType::Int32)?; | ||
let array = array.as_primitive::<Int32Type>(); | ||
|
@@ -1248,6 +1257,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi | |
write_primitive(typed, array.values(), levels) | ||
} | ||
}, | ||
ArrowDataType::RunEndEncoded(_run_ends, _values) => todo!(), | ||
_ => { | ||
let array = arrow_cast::cast(column, &ArrowDataType::Int64)?; | ||
let array = array.as_primitive::<Int64Type>(); | ||
|
@@ -1324,6 +1334,7 @@ fn write_leaf(writer: &mut ColumnWriter<'_>, levels: &ArrayLevels) -> Result<usi | |
let array = column.as_primitive::<Float16Type>(); | ||
get_float_16_array_slice(array, indices) | ||
} | ||
ArrowDataType::RunEndEncoded(_run_ends, _values) => todo!(), | ||
_ => { | ||
return Err(ParquetError::NYI( | ||
"Attempting to write an Arrow type that is not yet implemented".to_string(), | ||
|
@@ -4293,4 +4304,50 @@ mod tests { | |
assert_eq!(get_dict_page_size(col0_meta), 1024 * 1024); | ||
assert_eq!(get_dict_page_size(col1_meta), 1024 * 1024 * 4); | ||
} | ||
|
||
/// Round-trips a run-end-encoded string column through the Arrow writer and
/// checks that it is read back as a plain `Utf8` column with every logical
/// row present.
#[test]
fn arrow_writer_run_end_encoded() {
    // Build a run array of strings: 1000 x "alpha" followed by 1000 x "beta".
    let mut builder = StringRunBuilder::<Int16Type>::new();
    builder.extend(
        vec![Some("alpha"); 1000]
            .into_iter()
            .chain(vec![Some("beta"); 1000]),
    );
    let run_array: RunArray<Int16Type> = builder.finish();
    let schema = Arc::new(Schema::new(vec![Field::new(
        "ree",
        run_array.data_type().clone(),
        run_array.is_nullable(),
    )]));

    // Write the single-column batch to an in-memory parquet file.
    let mut parquet_bytes: Vec<u8> = Vec::new();
    let mut writer = ArrowWriter::try_new(&mut parquet_bytes, schema.clone(), None).unwrap();
    let batch = RecordBatch::try_new(schema.clone(), vec![Arc::new(run_array)]).unwrap();
    writer.write(&batch).unwrap();
    writer.close().unwrap();

    // The schema read back is plain Utf8, not dictionary or run-end encoded.
    let expected_schema = Arc::new(Schema::new(vec![Field::new(
        "ree",
        arrow_schema::DataType::Utf8,
        false,
    )]));

    // Read the file back and compare schema, batch count and total row count.
    let bytes = Bytes::from(parquet_bytes);
    let reader = ParquetRecordBatchReaderBuilder::try_new(bytes).unwrap();
    assert_eq!(reader.schema(), &expected_schema);
    let batches: Vec<_> = reader
        .build()
        .unwrap()
        .collect::<ArrowResult<Vec<_>>>()
        .unwrap();
    // NOTE(review): two batches presumably follows from the reader's default
    // batch size being smaller than 2000 rows — confirm.
    assert_eq!(batches.len(), 2);
    // Every logical (decoded) row must come back, not just one row per run.
    let total_rows = batches.iter().map(|b| b.num_rows()).sum::<usize>();
    assert_eq!(total_rows, 2000);
}
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is handled in #7713