From 63f4e26e8f861a01122a76be48c95bfd94def384 Mon Sep 17 00:00:00 2001 From: Chris Date: Tue, 3 Jun 2025 16:49:19 -0500 Subject: [PATCH] add group capture to replace --- tokenizers/src/normalizers/replace.rs | 154 +++++++++++++++++++++++++- 1 file changed, 150 insertions(+), 4 deletions(-) diff --git a/tokenizers/src/normalizers/replace.rs b/tokenizers/src/normalizers/replace.rs index 565757483..70d030658 100644 --- a/tokenizers/src/normalizers/replace.rs +++ b/tokenizers/src/normalizers/replace.rs @@ -2,6 +2,7 @@ use crate::tokenizer::pattern::Pattern; use crate::tokenizer::Decoder; use crate::tokenizer::{NormalizedString, Normalizer, Result}; use crate::utils::SysRegex; +use regex::Regex; use serde::{Deserialize, Serialize}; /// Represents the different patterns that `Replace` can use @@ -63,25 +64,52 @@ impl PartialEq for Replace { } } +/// Converts backreferences in the regex pattern string +fn convert_backrefs(pattern: &str) -> String { + let re = Regex::new(r"\\(\d+)").unwrap(); // match \1, \2, etc. + let converted = re.replace_all(pattern, |caps: ®ex::Captures| { + format!("${{{}}}", &caps[1]) // insert the captured number dynamically + }).to_string(); + converted +} + + impl Replace { pub fn new, C: Into>(pattern: I, content: C) -> Result { let pattern: ReplacePattern = pattern.into(); - let regex = match &pattern { + let converted_pattern = match &pattern { ReplacePattern::String(s) => SysRegex::new(®ex::escape(s))?, ReplacePattern::Regex(r) => SysRegex::new(r)?, }; + let converted_content = convert_backrefs(&content.into()); // Apply convert_backrefs to content + Ok(Self { pattern, - content: content.into(), - regex, + content: converted_content, + regex: converted_pattern, }) } } impl Normalizer for Replace { fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> { - normalized.replace(&self.regex, &self.content) + match &self.pattern { + ReplacePattern::Regex(pattern) => { + // Use the regex pattern directly for replacement + let re = Regex::new(pattern)?; + let current_text = normalized.get().to_owned(); + let result = re.replace_all(¤t_text, &self.content); + + // Directly set the normalized string to the result + *normalized = NormalizedString::from(result.as_ref()); + Ok(()) + } + ReplacePattern::String(_) => { + // Handle simple string replacement + normalized.replace(&self.regex, &self.content) + } + } } } @@ -134,6 +162,124 @@ mod tests { assert_eq!(&n.get(), &normalized); } + #[test] + fn test_replace_with_capture_groups() { + // Test case 1: Simple capture group and backreference + let text = "le travail est totalement pénible"; + let normalizer = Replace::new(ReplacePattern::Regex(r"(l)(e)".into()), r"\1 \2").unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + let output = normalized.get(); + println!("output is: {}", output); + assert!(output.contains("l e travail est total ement pénibl e")); + + // Test case 2: Phone number formatting + let text = "123-456-7890"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"(\d{3})-(\d{3})-(\d{4})".into()), + r"(\1) \2-\3" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + let output = normalized.get(); + println!("output is111: {}", output); + assert!(output.contains("(123) 456-7890")); + + // Test case 3: Greedy matching of repeated characters + let text = "aaaabbbbcccc"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"(a+)(b+)(c+)".into()), + r"[$1]-[$2]-[$3]" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("[aaaa]-[bbbb]-[cccc]")); + + // Test case 4: Non-greedy match with wildcards + let text = "

Some text

More text

"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"

(.*?)

".into()), + r"[P:$1]" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("[P:Some text]")); + assert!(normalized.get().contains("[P:More text]")); + + // Test case 5: Unicode capture and replace + let text = "東京 is the capital of 日本"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"(東京)".into()), + r"$1 (Tokyo)" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("東京 (Tokyo)")); + + // Test case 6: Backreferences with slashes and quotes + let text = "name=\"value\""; + let normalizer = Replace::new( + ReplacePattern::Regex(r#"name="([^"]+)""#.into()), + r"name='$1'" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("name='value'")); + + // Test case 7: Replace dollar sign with USD prefix + let text = "Price is $20 and discounted to $15"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"\$(\d+)".into()), + r"USD $1" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("USD 20")); + assert!(normalized.get().contains("USD 15")); + + // Test case 8: Escape literal dollar signs + let text = "Cost: $5, Tax: $0.50"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"\$".into()), + r"\\$" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("\\$5")); + assert!(normalized.get().contains("\\$0.50")); + + // Test case 9: Replace dollars with euros + let text = "Item costs $40, not $50"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"\$(\d+)".into()), + r"€$1" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("€40")); + assert!(normalized.get().contains("€50")); + + // Test case 10: Keep dollar amount but add comma formatting + let text = "That car costs $1000000!"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"\$(\d{1,3})(\d{3})(\d{3})".into()), + r"$$1,$2,$3" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("$1,000,000")); + + // Test case 11: Dollar sign at end of line (should not match as regex end anchor) + let text = "I love making $"; + let normalizer = Replace::new( + ReplacePattern::Regex(r"\$(\d*)".into()), + r"USD\1" + ).unwrap(); + let mut normalized = NormalizedString::from(text); + normalizer.normalize(&mut normalized).unwrap(); + assert!(normalized.get().contains("USD")); + } + #[test] fn serialization() { let replace = Replace::new("Hello", "Hey").unwrap();