Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
281 changes: 261 additions & 20 deletions datafusion/physical-expr/src/expressions/binary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1099,6 +1099,41 @@ pub fn binary(
Ok(Arc::new(BinaryExpr::new(lhs, op, rhs)))
}

/// Translates a SQL `SIMILAR TO` pattern into an anchored Rust regex.
///
/// The returned string always has the shape `^(?:...)$`, so the regex must
/// match the whole input and a top-level `|` cannot escape the anchors.
///
/// Outside a `[...]` bracket expression:
/// * `%` becomes `(?s:.*)` and `_` becomes `(?s:.)`; the `s` flag lets the
///   wildcards match newlines.
/// * `. ^ $ \` are SQL literals and are backslash-escaped.
/// * Everything else, including the POSIX metacharacters
///   `| * + ? ( ) { } [ ]`, is copied through unchanged.
///
/// Inside a bracket expression characters are copied through unchanged,
/// except that `\` is still escaped. The end of the bracket expression is
/// located following the POSIX rules:
/// * a `]` that directly follows the opening `[` or `[^` is a literal member
///   (emitted as `\]`), not the terminator;
/// * the `]` that ends a nested POSIX class such as `[:alpha:]` does not
///   terminate the enclosing bracket expression.
///
/// Without those two rules a later `%` or `_` in the same bracket expression
/// would be rewritten as a wildcard and corrupt the character class.
fn sql_similar_to_regex(pattern: &str) -> String {
    // Given the text that follows a `[` found inside a bracket expression,
    // returns the byte length of `:name:]` when that text starts with a
    // POSIX class tail (one or more ASCII letters between `:` and `:]`).
    fn posix_class_tail_len(rest: &str) -> Option<usize> {
        let name = rest.strip_prefix(':')?;
        let name_len = name.find(|c: char| !c.is_ascii_alphabetic())?;
        (name_len > 0 && name[name_len..].starts_with(":]")).then_some(1 + name_len + 2)
    }

    let mut result = String::with_capacity(pattern.len() + 10);
    result.push_str("^(?:");
    let mut in_bracket = false;
    // `rest` is the not-yet-translated suffix; a slice (rather than a plain
    // `chars()` loop) allows the multi-character lookahead needed below.
    let mut rest = pattern;
    while let Some(ch) = rest.chars().next() {
        rest = &rest[ch.len_utf8()..];
        match (ch, in_bracket) {
            ('%', false) => result.push_str("(?s:.*)"),
            ('_', false) => result.push_str("(?s:.)"),
            ('[', false) => {
                result.push('[');
                in_bracket = true;
                // Optional negation marker stays in first position.
                if let Some(tail) = rest.strip_prefix('^') {
                    result.push('^');
                    rest = tail;
                }
                // A leading `]` is a literal member of the set; escape it so
                // it cannot be read as the terminator.
                if let Some(tail) = rest.strip_prefix(']') {
                    result.push_str("\\]");
                    rest = tail;
                }
            }
            ('[', true) => {
                result.push('[');
                // Copy a nested `[:name:]` class as a unit so its closing
                // `]` does not end the enclosing bracket expression.
                if let Some(len) = posix_class_tail_len(rest) {
                    result.push_str(&rest[..len]);
                    rest = &rest[len..];
                }
            }
            (']', true) => {
                result.push(']');
                in_bracket = false;
            }
            // `. ^ $` are SQL literals but regex metacharacters when not
            // inside a bracket expression. `\` is a regex escape character
            // in every position, so it is always escaped.
            ('.' | '^' | '$', false) | ('\\', _) => {
                result.push('\\');
                result.push(ch);
            }
            (c, _) => result.push(c),
        }
    }
    result.push_str(")$");
    result
}

/// Create a similar to expression
pub fn similar_to(
negated: bool,
Expand All @@ -1112,7 +1147,39 @@ pub fn similar_to(
(true, false) => Operator::RegexNotMatch,
(true, true) => Operator::RegexNotIMatch,
};
Ok(Arc::new(BinaryExpr::new(expr, binary_op, pattern)))

let translated_pattern = match pattern.downcast_ref::<crate::expressions::Literal>() {
Some(literal) => match literal.value() {
ScalarValue::Utf8(Some(s)) => Arc::new(crate::expressions::Literal::new(
ScalarValue::Utf8(Some(sql_similar_to_regex(s.as_str()))),
)) as Arc<dyn PhysicalExpr>,
ScalarValue::LargeUtf8(Some(s)) => Arc::new(crate::expressions::Literal::new(
ScalarValue::LargeUtf8(Some(sql_similar_to_regex(s.as_str()))),
)) as Arc<dyn PhysicalExpr>,
ScalarValue::Utf8View(Some(s)) => Arc::new(crate::expressions::Literal::new(
ScalarValue::Utf8View(Some(sql_similar_to_regex(s.as_str()))),
)) as Arc<dyn PhysicalExpr>,
ScalarValue::Utf8(None)
| ScalarValue::LargeUtf8(None)
| ScalarValue::Utf8View(None) => pattern,
other => {
return not_impl_err!(
"SIMILAR TO with a non-string literal pattern is not supported: {other:?}"
);
}
},
None => {
return not_impl_err!(
"SIMILAR TO with a non-literal pattern is not yet supported"
);
}
};

Ok(Arc::new(BinaryExpr::new(
expr,
binary_op,
translated_pattern,
)))
}

#[cfg(test)]
Expand Down Expand Up @@ -4800,25 +4867,17 @@ mod tests {
Ok(())
}

/// Test helper for SIMILAR TO binary operation
fn apply_similar_to(
schema: &SchemaRef,
va: Vec<&str>,
vb: Vec<&str>,
pattern: &str,
negated: bool,
case_insensitive: bool,
expected: &BooleanArray,
) -> Result<()> {
let a = StringArray::from(va);
let b = StringArray::from(vb);
let op = similar_to(
negated,
case_insensitive,
col("a", schema)?,
col("b", schema)?,
)?;
let batch =
RecordBatch::try_new(Arc::clone(schema), vec![Arc::new(a), Arc::new(b)])?;
let op = similar_to(negated, case_insensitive, col("a", schema)?, lit(pattern))?;
let batch = RecordBatch::try_new(Arc::clone(schema), vec![Arc::new(a)])?;
let result = op
.evaluate(&batch)?
.into_array(batch.num_rows())
Expand All @@ -4830,32 +4889,214 @@ mod tests {

#[test]
fn test_similar_to() {
let schema = Arc::new(Schema::new(vec![
Field::new("a", DataType::Utf8, false),
Field::new("b", DataType::Utf8, false),
]));
let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));

// `%` matches any sequence; case-sensitive
let expected = [Some(true), Some(false)].iter().collect();
// case-sensitive
apply_similar_to(
&schema,
vec!["hello world", "Hello World"],
vec!["hello.*", "hello.*"],
"hello%",
false,
false,
&expected,
)
.unwrap();
// case-insensitive

// `%` matches any sequence; case-insensitive
let expected = [Some(true), Some(false)].iter().collect();
apply_similar_to(
&schema,
vec!["hello world", "bye"],
vec!["hello.*", "hello.*"],
"hello%",
false,
true,
&expected,
)
.unwrap();

// `_` matches exactly one character
let expected = [Some(true), Some(false), Some(false)].iter().collect();
apply_similar_to(&schema, vec!["x", "xy", ""], "_", false, false, &expected)
.unwrap();

// Match must cover the entire string (no implicit substring match)
let expected = [Some(false), Some(true)].iter().collect();
apply_similar_to(&schema, vec!["abc", "a"], "a", false, false, &expected)
.unwrap();

// `%` matches zero or more, so the empty string matches.
let expected = [Some(true), Some(true)].iter().collect();
apply_similar_to(&schema, vec!["", "anything"], "%", false, false, &expected)
.unwrap();

// `_` requires exactly one character, so the empty string does not
// match.
let expected = [Some(false), Some(true)].iter().collect();
apply_similar_to(&schema, vec!["", "x"], "_", false, false, &expected).unwrap();

// `%` at the start of the pattern is still anchored: the string
// must end where the trailing literal begins.
let expected = [Some(true), Some(false)].iter().collect();
apply_similar_to(&schema, vec!["abc", "abd"], "%c", false, false, &expected)
.unwrap();

// `%` and `_` together: `%` matches zero or more (including the
// empty string), `_` matches exactly one character.
let expected = [Some(true), Some(true)].iter().collect();
apply_similar_to(&schema, vec!["a", "abc"], "a%", false, false, &expected)
.unwrap();
let expected = [Some(true), Some(false)].iter().collect();
apply_similar_to(&schema, vec!["axb", "abc"], "a_b", false, false, &expected)
.unwrap();
}

// Regression guard: `. ^ $ \` are regex metacharacters but NOT SIMILAR TO
// metacharacters, so the translation has to treat them as plain SQL
// literals. If `.` were left unescaped, the pattern `a.` would accept any
// `a` followed by an arbitrary character (`ab`, `a1`, ...).
#[test]
fn test_similar_to_sql_literal_metachars() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));

    // Each entry: (column values, SIMILAR TO pattern, expected result per row).
    let cases = vec![
        // `.` only matches a literal dot, never "any character".
        (
            vec!["a.", "ab", "a"],
            "a.",
            vec![Some(true), Some(false), Some(false)],
        ),
        // `^` and `$` only match the literal characters `^` and `$`.
        (vec!["^x$", "x"], r"^x$", vec![Some(true), Some(false)]),
        // `\` is a literal backslash (the ESCAPE clause is not supported).
        (vec![r"a\b", "ab"], r"a\b", vec![Some(true), Some(false)]),
    ];

    for (values, pattern, wanted) in cases {
        let expected: BooleanArray = wanted.iter().collect();
        apply_similar_to(&schema, values, pattern, false, false, &expected).unwrap();
    }
}

// SIMILAR TO shares the POSIX metacharacters `| * + ? ( ) { } [ ]` with
// regular expressions. These cases check that the translator hands them to
// the regex engine unchanged.
#[test]
fn test_similar_to_posix_metachars() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));

    // Each entry: (column values, SIMILAR TO pattern, expected result per row).
    let cases = vec![
        // `|` alternation.
        (
            vec!["a", "c", "b"],
            "a|b",
            vec![Some(true), Some(false), Some(true)],
        ),
        // `*` zero or more.
        (
            vec!["", "aa", "ab"],
            "a*",
            vec![Some(true), Some(true), Some(false)],
        ),
        // `+` one or more.
        (vec!["", "aa"], "a+", vec![Some(false), Some(true)]),
        // `?` zero or one.
        (
            vec!["", "a", "aa"],
            "a?",
            vec![Some(true), Some(true), Some(false)],
        ),
        // `()` grouping.
        (
            vec!["ab", "abc", "ac"],
            "(ab)c?",
            vec![Some(true), Some(true), Some(false)],
        ),
        // `{m}` exact count.
        (vec!["aaa", "aa"], "a{3}", vec![Some(true), Some(false)]),
        // `[...]` character class.
        (vec!["a", "c"], "[ab]", vec![Some(true), Some(false)]),
        // `[^...]` negated character class.
        (vec!["c", "a"], "[^ab]", vec![Some(true), Some(false)]),
        // `[a-z]` range inside a character class.
        (vec!["m", "1"], "[a-z]", vec![Some(true), Some(false)]),
    ];

    for (values, pattern, wanted) in cases {
        let expected: BooleanArray = wanted.iter().collect();
        apply_similar_to(&schema, values, pattern, false, false, &expected).unwrap();
    }
}

// Regression guard: in SQL the wildcards `%` and `_` stand for "any
// character", and that includes a newline.
#[test]
fn test_similar_to_wildcards_match_newlines() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));

    // Each entry: (column values, SIMILAR TO pattern, expected result per row).
    let cases = vec![
        // `%` spans a newline; it may also match nothing, so `ab` matches
        // `a%b` too.
        (vec!["a\nb", "ab"], "a%b", vec![Some(true), Some(true)]),
        // `_` consumes exactly one character, which may be a newline; `ab`
        // has nothing between `a` and `b`, so it does not match `a_b`.
        (vec!["a\nb", "ab"], "a_b", vec![Some(true), Some(false)]),
    ];

    for (values, pattern, wanted) in cases {
        let expected: BooleanArray = wanted.iter().collect();
        apply_similar_to(&schema, values, pattern, false, false, &expected).unwrap();
    }
}

// A pattern that is a column reference rather than a literal must be
// rejected when the expression is built.
#[test]
fn test_similar_to_non_literal_pattern_errors() {
    let fields = vec![
        Field::new("a", DataType::Utf8, false),
        Field::new("b", DataType::Utf8, false),
    ];
    let schema = Arc::new(Schema::new(fields));

    let value_expr = col("a", &schema).unwrap();
    let pattern_expr = col("b", &schema).unwrap();
    let err = similar_to(false, false, value_expr, pattern_expr)
        .expect_err("non-literal pattern should error");

    let message = err.to_string();
    assert!(
        message.contains("non-literal pattern"),
        "unexpected error message: {err}"
    );
}

// A NULL string literal used as the pattern is accepted at construction
// time and evaluates to NULL for the row.
#[test]
fn test_similar_to_null_pattern() {
    let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)]));
    let values = StringArray::from(vec!["hello"]);

    let value_expr = col("a", &schema).unwrap();
    let null_pattern = lit(ScalarValue::Utf8(None));
    let op = similar_to(false, false, value_expr, null_pattern).unwrap();

    let batch =
        RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(values)]).unwrap();
    let evaluated = op.evaluate(&batch).unwrap();
    let result = evaluated.into_array(batch.num_rows()).unwrap();

    let expected = [None::<bool>].iter().collect::<BooleanArray>();
    assert_eq!(result.as_ref(), &expected);
}

pub fn binary_expr(
Expand Down
Loading