Skip to content

Commit 620f5e3

Browse files
Implement String View (Utf8View/BinaryView) Optimizations
Introduces a two-stage filter for ByteView types. Stage 1 uses a fast DirectProbeFilter on masked views (length + prefix) for quick rejection; Stage 2 performs full verification only for potential long-string matches. The filter is selected for Utf8View and BinaryView IN lists.
1 parent 0111ce5 commit 620f5e3

4 files changed

Lines changed: 461 additions & 7 deletions

File tree

datafusion/physical-expr/src/expressions/in_list/primitive_filter.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -376,6 +376,12 @@ where
376376
Ok(Self::from_unique_values(unique_values, arr.null_count()))
377377
}
378378

379+
/// Creates a DirectProbeFilter from pre-processed values.
380+
pub(super) fn from_values(values: impl Iterator<Item = T::Native>) -> Self {
381+
let unique_values: HashSet<_> = values.collect();
382+
Self::from_unique_values(unique_values, 0)
383+
}
384+
379385
fn from_unique_values(unique_values: HashSet<T::Native>, null_count: usize) -> Self {
380386
// Size table to ~25% load factor for fewer collisions
381387
let n = unique_values.len().max(1);
@@ -410,7 +416,7 @@ where
410416
///
411417
/// Returns true if the value is in the set.
412418
#[inline(always)]
413-
fn contains_single(&self, needle: T::Native) -> bool {
419+
pub(super) fn contains_single(&self, needle: T::Native) -> bool {
414420
let mut slot = needle.probe_hash() & self.mask;
415421
loop {
416422
// SAFETY: `slot` is always < table.len() because:

datafusion/physical-expr/src/expressions/in_list/result.rs

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,69 @@ where
5858
build_result_from_contains(needle_nulls, haystack_has_nulls, negated, contains_buf)
5959
}
6060

61+
/// Builds a BooleanArray result while skipping contains checks for null needles.
62+
///
63+
/// This is useful when `contains` is expensive, for example when a string path
64+
/// may need hashing and full byte comparison. If there are no actual nulls, it
65+
/// falls back to the same branch-free contains collection as
66+
/// `build_in_list_result`.
67+
#[inline]
68+
pub(crate) fn build_in_list_result_with_null_shortcircuit<C>(
69+
len: usize,
70+
needle_nulls: Option<&NullBuffer>,
71+
needle_null_count: usize,
72+
haystack_has_nulls: bool,
73+
negated: bool,
74+
mut contains: C,
75+
) -> BooleanArray
76+
where
77+
C: FnMut(usize) -> bool,
78+
{
79+
let effective_nulls = needle_nulls.filter(|_| needle_null_count > 0);
80+
81+
let contains_buf = match effective_nulls {
82+
Some(nulls) => {
83+
BooleanBuffer::collect_bool(len, |i| nulls.is_valid(i) && contains(i))
84+
}
85+
None => BooleanBuffer::collect_bool(len, contains),
86+
};
87+
88+
build_result_from_premasked_contains(
89+
effective_nulls,
90+
haystack_has_nulls,
91+
negated,
92+
contains_buf,
93+
)
94+
}
95+
96+
/// Builds a result from a contains buffer that is already false at null needles.
97+
#[inline]
98+
fn build_result_from_premasked_contains(
99+
needle_nulls: Option<&NullBuffer>,
100+
haystack_has_nulls: bool,
101+
negated: bool,
102+
contains_buf: BooleanBuffer,
103+
) -> BooleanArray {
104+
match (needle_nulls, haystack_has_nulls, negated) {
105+
(_, true, false) => {
106+
BooleanArray::new(contains_buf.clone(), Some(NullBuffer::new(contains_buf)))
107+
}
108+
(Some(v), true, true) => BooleanArray::new(
109+
v.inner() ^ &contains_buf,
110+
Some(NullBuffer::new(contains_buf)),
111+
),
112+
(None, true, true) => {
113+
BooleanArray::new(!&contains_buf, Some(NullBuffer::new(contains_buf)))
114+
}
115+
(Some(v), false, false) => BooleanArray::new(contains_buf, Some(v.clone())),
116+
(Some(v), false, true) => {
117+
BooleanArray::new(v.inner() & &(!&contains_buf), Some(v.clone()))
118+
}
119+
(None, false, false) => BooleanArray::new(contains_buf, None),
120+
(None, false, true) => BooleanArray::new(!&contains_buf, None),
121+
}
122+
}
123+
61124
/// Builds a BooleanArray result from a pre-computed contains buffer.
62125
///
63126
/// This version does not assume contains_buf is pre-masked at null positions.

datafusion/physical-expr/src/expressions/in_list/strategy.rs

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,9 @@ use super::array_static_filter::ArrayStaticFilter;
2626
use super::primitive_filter::*;
2727
use super::static_filter::{StaticFilter, handle_dictionary};
2828
use super::transform::{
29-
make_bitmap_filter, make_branchless_filter, reinterpret_any_primitive_to,
29+
make_bitmap_filter, make_branchless_filter, make_byte_view_masked_filter,
30+
make_utf8view_branchless_filter, make_utf8view_hash_filter,
31+
reinterpret_any_primitive_to, utf8view_all_short_strings,
3032
};
3133

3234
/// Maximum list size for branchless lookup on 4-byte primitives (Int32, UInt32, Float32).
@@ -97,6 +99,16 @@ pub(super) fn instantiate_static_filter(
9799

98100
let len = in_array.len();
99101
let dt = in_array.data_type();
102+
103+
// Special case: Utf8View with short strings can be reinterpreted as i128
104+
if matches!(dt, DataType::Utf8View) && utf8view_all_short_strings(in_array.as_ref()) {
105+
return if len <= BRANCHLESS_MAX_16B {
106+
make_utf8view_branchless_filter(&in_array)
107+
} else {
108+
make_utf8view_hash_filter(&in_array)
109+
};
110+
}
111+
100112
let strategy = select_strategy(dt, len);
101113

102114
match (dt, strategy) {
@@ -117,6 +129,14 @@ pub(super) fn instantiate_static_filter(
117129
exec_datafusion_err!("Hashed strategy selected but no filter for {:?}", dt)
118130
})?,
119131

132+
// Byte view filters (Utf8View, BinaryView)
133+
(DataType::Utf8View, Generic) => {
134+
make_byte_view_masked_filter::<StringViewType>(in_array)
135+
}
136+
(DataType::BinaryView, Generic) => {
137+
make_byte_view_masked_filter::<BinaryViewType>(in_array)
138+
}
139+
120140
// Fallback for nested/complex types and strings.
121141
(_, Generic) => Ok(Arc::new(ArrayStaticFilter::try_new(in_array)?)),
122142
}

0 commit comments

Comments
 (0)