Skip to content

add group capture to replace #1788

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
154 changes: 150 additions & 4 deletions tokenizers/src/normalizers/replace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ use crate::tokenizer::pattern::Pattern;
use crate::tokenizer::Decoder;
use crate::tokenizer::{NormalizedString, Normalizer, Result};
use crate::utils::SysRegex;
use regex::Regex;
use serde::{Deserialize, Serialize};

/// Represents the different patterns that `Replace` can use
Expand Down Expand Up @@ -63,25 +64,52 @@ impl PartialEq for Replace {
}
}

/// Converts backreferences in the regex pattern string
fn convert_backrefs(pattern: &str) -> String {
let re = Regex::new(r"\\(\d+)").unwrap(); // match \1, \2, etc.
let converted = re.replace_all(pattern, |caps: &regex::Captures| {
format!("${{{}}}", &caps[1]) // insert the captured number dynamically
}).to_string();
converted
}


impl Replace {
pub fn new<I: Into<ReplacePattern>, C: Into<String>>(pattern: I, content: C) -> Result<Self> {
let pattern: ReplacePattern = pattern.into();
let regex = match &pattern {
let converted_pattern = match &pattern {
ReplacePattern::String(s) => SysRegex::new(&regex::escape(s))?,
ReplacePattern::Regex(r) => SysRegex::new(r)?,
};

let converted_content = convert_backrefs(&content.into()); // Apply convert_backrefs to content

Ok(Self {
pattern,
content: content.into(),
regex,
content: converted_content,
regex: converted_pattern,
})
}
}

impl Normalizer for Replace {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()> {
normalized.replace(&self.regex, &self.content)
match &self.pattern {
ReplacePattern::Regex(pattern) => {
// Use the regex pattern directly for replacement
let re = Regex::new(pattern)?;
let current_text = normalized.get().to_owned();
let result = re.replace_all(&current_text, &self.content);

// Directly set the normalized string to the result
*normalized = NormalizedString::from(result.as_ref());
Ok(())
}
ReplacePattern::String(_) => {
// Handle simple string replacement
normalized.replace(&self.regex, &self.content)
}
}
}
}

Expand Down Expand Up @@ -134,6 +162,124 @@ mod tests {
assert_eq!(&n.get(), &normalized);
}

#[test]
fn test_replace_with_capture_groups() {
// Test case 1: Simple capture group and backreference
let text = "le travail est totalement pénible";
let normalizer = Replace::new(ReplacePattern::Regex(r"(l)(e)".into()), r"\1 \2").unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
let output = normalized.get();
println!("output is: {}", output);
assert!(output.contains("l e travail est total ement pénibl e"));

// Test case 2: Phone number formatting
let text = "123-456-7890";
let normalizer = Replace::new(
ReplacePattern::Regex(r"(\d{3})-(\d{3})-(\d{4})".into()),
r"(\1) \2-\3"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
let output = normalized.get();
println!("output is111: {}", output);
assert!(output.contains("(123) 456-7890"));

// Test case 3: Greedy matching of repeated characters
let text = "aaaabbbbcccc";
let normalizer = Replace::new(
ReplacePattern::Regex(r"(a+)(b+)(c+)".into()),
r"[$1]-[$2]-[$3]"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("[aaaa]-[bbbb]-[cccc]"));

// Test case 4: Non-greedy match with wildcards
let text = "<p>Some text</p><p>More text</p>";
let normalizer = Replace::new(
ReplacePattern::Regex(r"<p>(.*?)</p>".into()),
r"[P:$1]"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("[P:Some text]"));
assert!(normalized.get().contains("[P:More text]"));

// Test case 5: Unicode capture and replace
let text = "東京 is the capital of 日本";
let normalizer = Replace::new(
ReplacePattern::Regex(r"(東京)".into()),
r"$1 (Tokyo)"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("東京 (Tokyo)"));

// Test case 6: Backreferences with slashes and quotes
let text = "name=\"value\"";
let normalizer = Replace::new(
ReplacePattern::Regex(r#"name="([^"]+)""#.into()),
r"name='$1'"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("name='value'"));

// Test case 7: Replace dollar sign with USD prefix
let text = "Price is $20 and discounted to $15";
let normalizer = Replace::new(
ReplacePattern::Regex(r"\$(\d+)".into()),
r"USD $1"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("USD 20"));
assert!(normalized.get().contains("USD 15"));

// Test case 8: Escape literal dollar signs
let text = "Cost: $5, Tax: $0.50";
let normalizer = Replace::new(
ReplacePattern::Regex(r"\$".into()),
r"\\$"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("\\$5"));
assert!(normalized.get().contains("\\$0.50"));

// Test case 9: Replace dollars with euros
let text = "Item costs $40, not $50";
let normalizer = Replace::new(
ReplacePattern::Regex(r"\$(\d+)".into()),
r"€$1"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("€40"));
assert!(normalized.get().contains("€50"));

// Test case 10: Keep dollar amount but add comma formatting
let text = "That car costs $1000000!";
let normalizer = Replace::new(
ReplacePattern::Regex(r"\$(\d{1,3})(\d{3})(\d{3})".into()),
r"$$1,$2,$3"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("$1,000,000"));

// Test case 11: Dollar sign at end of line (should not match as regex end anchor)
let text = "I love making $";
let normalizer = Replace::new(
ReplacePattern::Regex(r"\$(\d*)".into()),
r"USD\1"
).unwrap();
let mut normalized = NormalizedString::from(text);
normalizer.normalize(&mut normalized).unwrap();
assert!(normalized.get().contains("USD"));
}

#[test]
fn serialization() {
let replace = Replace::new("Hello", "Hey").unwrap();
Expand Down