Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 20 additions & 13 deletions datafusion/functions-nested/src/array_compact.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,14 +130,22 @@ fn compact_list<O: OffsetSizeTrait>(
field: &Arc<arrow::datatypes::Field>,
) -> Result<ArrayRef> {
let values = list_array.values();

// Fast path: no nulls in values, return input unchanged
if values.null_count() == 0 {
// Use logical nulls so element types without a validity buffer
// (e.g. NullArray) are still treated as null.
let Some(values_nulls) = values.logical_nulls() else {
// Fast path: `logical_nulls()` returned `None`, so there are no nulls to remove
return Ok(Arc::new(list_array.clone()));
};
let values_null_count = values_nulls.null_count();
if values_null_count == 0 {
// Fast path: logical null buffer present but no nulls set
return Ok(Arc::new(list_array.clone()));
}

let list_nulls = list_array.nulls();
let list_offsets = list_array.offsets();
let original_data = values.to_data();
let capacity = original_data.len() - values.null_count();
let capacity = original_data.len() - values_null_count;
let mut offsets = Vec::<O>::with_capacity(list_array.len() + 1);
offsets.push(O::zero());
let mut mutable = MutableArrayData::with_capacities(
Expand All @@ -147,25 +155,25 @@ fn compact_list<O: OffsetSizeTrait>(
);

for row_index in 0..list_array.len() {
if list_array.nulls().is_some_and(|n| n.is_null(row_index)) {
if list_nulls.is_some_and(|n| n.is_null(row_index)) {
offsets.push(offsets[row_index]);
continue;
}

let start = list_array.offsets()[row_index].as_usize();
let end = list_array.offsets()[row_index + 1].as_usize();
let mut copied = 0usize;
let start = list_offsets[row_index].as_usize();
let end = list_offsets[row_index + 1].as_usize();
let row_null_count = values_nulls.slice(start, end - start).null_count();
let kept = (end - start) - row_null_count;

// Batch consecutive non-null elements into single extend() calls
// to reduce per-element overhead. For [1, 2, NULL, 3, 4] this
// produces 2 extend calls (0..2, 3..5) instead of 4 individual ones.
let mut batch_start: Option<usize> = None;
for i in start..end {
if values.is_null(i) {
if values_nulls.is_null(i) {
// Null breaks the current batch — flush it
if let Some(bs) = batch_start {
mutable.extend(0, bs, i);
copied += i - bs;
batch_start = None;
}
} else if batch_start.is_none() {
Expand All @@ -175,17 +183,16 @@ fn compact_list<O: OffsetSizeTrait>(
// Flush any remaining batch after the loop
if let Some(bs) = batch_start {
mutable.extend(0, bs, end);
copied += end - bs;
}

offsets.push(offsets[row_index] + O::usize_as(copied));
offsets.push(offsets[row_index] + O::usize_as(kept));
}

let new_values = make_array(mutable.freeze());
Ok(Arc::new(GenericListArray::<O>::try_new(
Arc::clone(field),
OffsetBuffer::new(offsets.into()),
new_values,
list_array.nulls().cloned(),
list_nulls.cloned(),
)?))
}
17 changes: 17 additions & 0 deletions datafusion/sqllogictest/test_files/array/array_distinct.slt
Original file line number Diff line number Diff line change
Expand Up @@ -137,6 +137,12 @@ select array_compact(arrow_cast([NULL, NULL, NULL], 'List(Int64)'));
----
[]

# all nulls with untyped NULLs (List(Null) inner values are a NullArray)
query ?
select array_compact(make_array(NULL, NULL, NULL));
----
[]

# empty array
query ?
select array_compact([]);
Expand Down Expand Up @@ -167,6 +173,17 @@ select array_compact([make_array(1, 2), NULL, make_array(3, 4)]);
----
[[1, 2], [3, 4]]

# nested array of all-null inner lists: outer elements are non-null, kept as-is
query ?
select array_compact(make_array(make_array(NULL, NULL, NULL), make_array(NULL, NULL)));
----
[[NULL, NULL, NULL], [NULL, NULL]]

query ?
select array_compact(make_array(make_array(NULL, NULL, NULL), make_array(NULL, NULL, NULL)));
----
[[NULL, NULL, NULL], [NULL, NULL, NULL]]

# LargeList
query ?
select array_compact(arrow_cast([1, NULL, 2, NULL, 3], 'LargeList(Int64)'));
Expand Down
Loading