/// Translates a SQL `SIMILAR TO` pattern into an anchored Rust regex string.
///
/// The result is always wrapped as `^(?:...)$`, so the regex has to match the
/// entire input rather than a substring.
///
/// Outside a `[...]` bracket expression:
/// * `%` becomes `(?s:.*)` and `_` becomes `(?s:.)`; the `s` flag lets the
///   wildcards match newlines as well.
/// * `. ^ $ \` are SQL literals and are emitted backslash-escaped.
/// * Every other character, including `| * + ? ( ) { }`, is copied verbatim.
///
/// Inside a bracket expression:
/// * `%`, `_`, `.`, `$` and `^` are copied verbatim (a `^` directly after the
///   opening `[` keeps its negation meaning).
/// * A `]` that is the first member (directly after `[` or `[^`) is a literal
///   member, not the terminator, and is emitted as `\]`.
/// * `[:name:]` with an ASCII-alphabetic `name` is copied as a unit, so its
///   own `]` does not end the enclosing bracket expression.
/// * Any other `[` is emitted as `\[`; the regex engine would otherwise try
///   to parse it as the start of a nested class.
/// * `\`, `&` and `~` are emitted escaped: `\` is the regex escape character,
///   and the doubled forms `&&` / `~~` are set operators in the regex engine.
///
/// An unterminated bracket expression is passed through unchanged; no
/// validation of the resulting regex is done here.
fn sql_similar_to_regex(pattern: &str) -> String {
    let mut result = String::with_capacity(pattern.len() + 10);
    result.push_str("^(?:");
    let mut in_bracket = false;
    // `rest` is the not-yet-consumed tail of `pattern`; using a slice cursor
    // makes the look-ahead for `^`, `]` and `[:name:]` straightforward.
    let mut rest = pattern;
    while let Some(ch) = rest.chars().next() {
        rest = &rest[ch.len_utf8()..];
        if in_bracket {
            match ch {
                ']' => {
                    result.push(']');
                    in_bracket = false;
                }
                '[' => {
                    // Recognise a POSIX character class `[:name:]`.
                    let class = rest.strip_prefix(':').and_then(|tail| {
                        let end = tail.find(":]")?;
                        let name = &tail[..end];
                        let valid = !name.is_empty()
                            && name.bytes().all(|b| b.is_ascii_alphabetic());
                        valid.then_some((name, &tail[end + 2..]))
                    });
                    match class {
                        Some((name, tail)) => {
                            result.push_str("[:");
                            result.push_str(name);
                            result.push_str(":]");
                            rest = tail;
                        }
                        // Not a class: a plain `[` member of the bracket.
                        None => result.push_str("\\["),
                    }
                }
                '\\' | '&' | '~' => {
                    result.push('\\');
                    result.push(ch);
                }
                c => result.push(c),
            }
        } else {
            match ch {
                '%' => result.push_str("(?s:.*)"),
                '_' => result.push_str("(?s:.)"),
                '[' => {
                    result.push('[');
                    in_bracket = true;
                    // Optional negation marker keeps its meaning.
                    if let Some(tail) = rest.strip_prefix('^') {
                        result.push('^');
                        rest = tail;
                    }
                    // A `]` in first position is a literal member.
                    if let Some(tail) = rest.strip_prefix(']') {
                        result.push_str("\\]");
                        rest = tail;
                    }
                }
                '.' | '^' | '$' | '\\' => {
                    result.push('\\');
                    result.push(ch);
                }
                c => result.push(c),
            }
        }
    }
    result.push_str(")$");
    result
}
col("a", schema)?, lit(pattern))?; + let batch = RecordBatch::try_new(Arc::clone(schema), vec![Arc::new(a)])?; let result = op .evaluate(&batch)? .into_array(batch.num_rows()) @@ -4830,32 +4889,214 @@ mod tests { #[test] fn test_similar_to() { - let schema = Arc::new(Schema::new(vec![ - Field::new("a", DataType::Utf8, false), - Field::new("b", DataType::Utf8, false), - ])); + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + // `%` matches any sequence; case-sensitive let expected = [Some(true), Some(false)].iter().collect(); - // case-sensitive apply_similar_to( &schema, vec!["hello world", "Hello World"], - vec!["hello.*", "hello.*"], + "hello%", false, false, &expected, ) .unwrap(); - // case-insensitive + + // `%` matches any sequence; case-insensitive + let expected = [Some(true), Some(false)].iter().collect(); apply_similar_to( &schema, vec!["hello world", "bye"], - vec!["hello.*", "hello.*"], + "hello%", false, true, &expected, ) .unwrap(); + + // `_` matches exactly one character + let expected = [Some(true), Some(false), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["x", "xy", ""], "_", false, false, &expected) + .unwrap(); + + // Match must cover the entire string (no implicit substring match) + let expected = [Some(false), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["abc", "a"], "a", false, false, &expected) + .unwrap(); + + // `%` matches zero or more, so the empty string matches. + let expected = [Some(true), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["", "anything"], "%", false, false, &expected) + .unwrap(); + + // `_` requires exactly one character, so the empty string does not + // match. + let expected = [Some(false), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["", "x"], "_", false, false, &expected).unwrap(); + + // `%` at the start of the pattern is still anchored: the string + // must end where the trailing literal begins. 
+ let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["abc", "abd"], "%c", false, false, &expected) + .unwrap(); + + // `%` and `_` together: `%` matches zero or more (including the + // empty string), `_` matches exactly one character. + let expected = [Some(true), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["a", "abc"], "a%", false, false, &expected) + .unwrap(); + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["axb", "abc"], "a_b", false, false, &expected) + .unwrap(); + } + + // Regression: regex metacharacters that are NOT SIMILAR TO metacharacters + // (`. ^ $ \`) must be treated as SQL literals. Without escaping, `a.` + // would match any `a` followed by any character (`ab`, `a1`, ...). + #[test] + fn test_similar_to_sql_literal_metachars() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + + // `.` is a literal, not the regex "any character" operator. + let expected = [Some(true), Some(false), Some(false)].iter().collect(); + apply_similar_to( + &schema, + vec!["a.", "ab", "a"], + "a.", + false, + false, + &expected, + ) + .unwrap(); + + // `^` and `$` are literals and only match the literal `^` and `$`. + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["^x$", "x"], r"^x$", false, false, &expected) + .unwrap(); + + // `\` is a literal backslash (we don't support the ESCAPE clause). + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec![r"a\b", "ab"], r"a\b", false, false, &expected) + .unwrap(); + } + + // SIMILAR TO borrows POSIX metacharacters from regular expressions: + // `| * + ? ( ) { } [ ]`. The translator passes them through to the + // underlying regex engine. + #[test] + fn test_similar_to_posix_metachars() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + + // `|` alternation. 
+ let expected = [Some(true), Some(false), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["a", "c", "b"], "a|b", false, false, &expected) + .unwrap(); + + // `*` zero or more. + let expected = [Some(true), Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["", "aa", "ab"], "a*", false, false, &expected) + .unwrap(); + + // `+` one or more. + let expected = [Some(false), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["", "aa"], "a+", false, false, &expected).unwrap(); + + // `?` zero or one. + let expected = [Some(true), Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["", "a", "aa"], "a?", false, false, &expected) + .unwrap(); + + // `()` grouping. + let expected = [Some(true), Some(true), Some(false)].iter().collect(); + apply_similar_to( + &schema, + vec!["ab", "abc", "ac"], + "(ab)c?", + false, + false, + &expected, + ) + .unwrap(); + + // `{m}` exact count. + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["aaa", "aa"], "a{3}", false, false, &expected) + .unwrap(); + + // `[...]` character class. + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["a", "c"], "[ab]", false, false, &expected) + .unwrap(); + + // `[^...]` negated character class. + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["c", "a"], "[^ab]", false, false, &expected) + .unwrap(); + + // `[a-z]` range inside a character class. + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["m", "1"], "[a-z]", false, false, &expected) + .unwrap(); + } + + // Regression: `%` and `_` must match newlines, matching SQL semantics + // where these wildcards match "any character". + #[test] + fn test_similar_to_wildcards_match_newlines() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + + // `%` crosses a newline. 
(`%` also matches zero characters, so `ab` + // matches `a%b` as well.) + let expected = [Some(true), Some(true)].iter().collect(); + apply_similar_to(&schema, vec!["a\nb", "ab"], "a%b", false, false, &expected) + .unwrap(); + + // `_` matches a single newline. (`_` requires exactly one character, + // so `ab` does not match `a_b`.) + let expected = [Some(true), Some(false)].iter().collect(); + apply_similar_to(&schema, vec!["a\nb", "ab"], "a_b", false, false, &expected) + .unwrap(); + } + + #[test] + fn test_similar_to_non_literal_pattern_errors() { + let schema = Arc::new(Schema::new(vec![ + Field::new("a", DataType::Utf8, false), + Field::new("b", DataType::Utf8, false), + ])); + let err = similar_to( + false, + false, + col("a", &schema).unwrap(), + col("b", &schema).unwrap(), + ) + .expect_err("non-literal pattern should error"); + assert!( + err.to_string().contains("non-literal pattern"), + "unexpected error message: {err}" + ); + } + + #[test] + fn test_similar_to_null_pattern() { + let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Utf8, false)])); + let a = StringArray::from(vec!["hello"]); + let op = similar_to( + false, + false, + col("a", &schema).unwrap(), + lit(ScalarValue::Utf8(None)), + ) + .unwrap(); + let batch = RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(a)]).unwrap(); + let result = op + .evaluate(&batch) + .unwrap() + .into_array(batch.num_rows()) + .unwrap(); + let expected: BooleanArray = [None].iter().collect(); + assert_eq!(result.as_ref(), &expected); } pub fn binary_expr( diff --git a/datafusion/sqllogictest/test_files/strings.slt b/datafusion/sqllogictest/test_files/strings.slt index 9fa453fa02523..908559021b2a0 100644 --- a/datafusion/sqllogictest/test_files/strings.slt +++ b/datafusion/sqllogictest/test_files/strings.slt @@ -71,25 +71,149 @@ p2 p2e1 p2m1e1 -# SIMILAR TO +# SIMILAR TO with `%` wildcard (zero or more characters) query T rowsort -SELECT s FROM test WHERE s SIMILAR TO 'p[12].*'; +SELECT s FROM 
test WHERE s SIMILAR TO 'p1%'; ---- p1 p1e1 p1m1e1 + +# SIMILAR TO with `_` wildcard (exactly one character) +query T rowsort +SELECT s FROM test WHERE s SIMILAR TO 'p_'; +---- +p1 p2 -p2e1 -p2m1e1 -# NOT SIMILAR TO +# SIMILAR TO requires full-string match (no implicit substring) query T rowsort -SELECT s FROM test WHERE s NOT SIMILAR TO 'p[12].*'; +SELECT s FROM test WHERE s SIMILAR TO 'p1'; ---- -P1 -P1e1 -P1m1e1 -e1 +p1 + +# SIMILAR TO treats `. ^ $` as SQL literals (not as regex metachars). +query B +SELECT 'a.' SIMILAR TO 'a.'; +---- +true + +query B +SELECT 'ab' SIMILAR TO 'a.'; +---- +false + +query B +SELECT '^x$' SIMILAR TO '^x$'; +---- +true + +query B +SELECT 'x' SIMILAR TO '^x$'; +---- +false + +# SIMILAR TO supports the POSIX metacharacters `| * + ? ( ) { } [ ]`. +query B +SELECT 'a' SIMILAR TO 'a|b'; +---- +true + +query B +SELECT 'c' SIMILAR TO 'a|b'; +---- +false + +query B +SELECT '' SIMILAR TO 'a*'; +---- +true + +query B +SELECT 'aa' SIMILAR TO 'a*'; +---- +true + +query B +SELECT 'ab' SIMILAR TO 'a*'; +---- +false + +query B +SELECT 'a' SIMILAR TO 'a+'; +---- +true + +query B +SELECT '' SIMILAR TO 'a+'; +---- +false + +query B +SELECT '' SIMILAR TO 'a?'; +---- +true + +query B +SELECT 'a' SIMILAR TO 'a?'; +---- +true + +query B +SELECT 'aa' SIMILAR TO 'a?'; +---- +false + +query B +SELECT 'ab' SIMILAR TO '(ab)'; +---- +true + +query B +SELECT 'a' SIMILAR TO 'a{2}'; +---- +false + +query B +SELECT 'aa' SIMILAR TO 'a{2}'; +---- +true + +query B +SELECT 'a' SIMILAR TO '[ab]'; +---- +true + +query B +SELECT 'c' SIMILAR TO '[ab]'; +---- +false + +query B +SELECT 'c' SIMILAR TO '[^ab]'; +---- +true + +query B +SELECT 'a' SIMILAR TO '[^ab]'; +---- +false + +# SIMILAR TO wildcards match newlines. +query B +SELECT 'a' || chr(10) || 'b' SIMILAR TO 'a%b'; +---- +true + +query B +SELECT 'a' || chr(10) || 'b' SIMILAR TO 'a_b'; +---- +true + +query B +SELECT 'ab' SIMILAR TO 'a_b'; +---- +false # NOT LIKE query T rowsort