From 26e7a5490c7d4001192c3e970b6618f521685bea Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Thu, 22 May 2025 16:56:28 +0200
Subject: [PATCH 1/9] Implement "replace" transform in Field operation module.

Implements correct functionality for the "replace" attribute in mlc.Transform. It takes a string of the form "pattern/replacement" and performs re.sub with `pattern` and `replacement`.

Slashes can be escaped using a backslash, e.g., "my\/path/your\/path".
---
 .../_src/operation_graph/operations/field.py  | 10 +++++++++
 .../operation_graph/operations/field_test.py  | 21 +++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index 056899b6e..56e1cd272 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -63,6 +63,16 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any:
             raise ValueError(f"`format` only applies to dates. Got {field.data_type}")
     elif transform.separator is not None:
         return value.split(transform.separator)
+    elif transform.replace is not None:
+        if isinstance(value, pathlib.PurePath):
+            value = os.fspath(value)
+        # split on unescaped slash
+        parts = re.split(r'(?<!\\)/', transform.replace)
+        if len(parts) != 2:
+            raise ValueError(f"`replace` must have exactly one unescaped slash. Got {transform.replace} which has {len(parts) - 1} unescaped slashes.")
+        parts =  [part.replace('\\/', '/') for part in parts]
+        pattern, replacement = parts
+        return re.sub(pattern, replacement)
     return value
 
 
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
index 4299193b6..5d3df903d 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -372,6 +372,27 @@ def test_extract_lines(separator):
             "2024-12-10",
             False,
         ],
+        [
+            "foo",
+            Source(transforms=[Transform(replace="foo/bar")]),
+            DataType.TEXT,
+            "bar",
+            False,
+        ],
+        [
+            "a/path/to/a/file",
+            Source(transforms=[Transform(replace=r"a\/path\/to\/a/some\/new\/path")]),
+            DataType.TEXT,
+            "some/new/path/file",
+            False,
+        ],
+                [
+            "123",
+            Source(transforms=[Transform(replace=r"[0-9]+([0-9])/abc\1")]),
+            DataType.TEXT,
+            "abc3",
+            False,
+        ],
     ],
 )
 def test_apply_transforms_fn(value, source, data_type, expected_value, repeated):

From a8508e5eaa7bf766ec99bc29d625cf248818e9c6 Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Thu, 22 May 2025 16:56:28 +0200
Subject: [PATCH 2/9] Implement "replace" transform in Field operation module.

Implements correct functionality for the "replace" attribute in mlc.Transform. It takes a string of the form "pattern/replacement" and performs re.sub with `pattern` and `replacement`.

Slashes can be escaped using a backslash, e.g., "my\/path/your\/path".
---
 .../mlcroissant/_src/operation_graph/operations/field.py        | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index 56e1cd272..99449a7b9 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -72,7 +72,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any:
             raise ValueError(f"`replace` must have exactly one unescaped slash. Got {transform.replace} which has {len(parts) - 1} unescaped slashes.")
         parts =  [part.replace('\\/', '/') for part in parts]
         pattern, replacement = parts
-        return re.sub(pattern, replacement)
+        return re.sub(pattern, replacement, value)
     return value
 
 

From 8e6378a6aff2905b3d7b02323d2ca431e860539f Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Thu, 22 May 2025 17:41:47 +0200
Subject: [PATCH 3/9] fix linter issues

---
 .../_src/operation_graph/operations/field.py           | 10 +++++++---
 .../_src/operation_graph/operations/field_test.py      |  4 ++--
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index 99449a7b9..7640e8c80 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -67,10 +67,14 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any:
         if isinstance(value, pathlib.PurePath):
             value = os.fspath(value)
         # split on unescaped slash
-        parts = re.split(r'(?<!\\)/', transform.replace)
+        parts = re.split(r"(?<!\\)/", transform.replace)
         if len(parts) != 2:
-            raise ValueError(f"`replace` must have exactly one unescaped slash. Got {transform.replace} which has {len(parts) - 1} unescaped slashes.")
-        parts =  [part.replace('\\/', '/') for part in parts]
+            raise ValueError(
+                f"`replace` must have exactly one unescaped slash. "
+                f"Got {transform.replace} which has "
+                f"{len(parts) - 1} unescaped slashes."
+            )
+        parts = [part.replace("\\/", "/") for part in parts]
         pattern, replacement = parts
         return re.sub(pattern, replacement, value)
     return value
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
index 5d3df903d..6e9e7d4d0 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -380,8 +380,8 @@ def test_extract_lines(separator):
             False,
         ],
         [
-            "a/path/to/a/file",
-            Source(transforms=[Transform(replace=r"a\/path\/to\/a/some\/new\/path")]),
+            "path/to/a/file",
+            Source(transforms=[Transform(replace=r"path\/to\/a/some\/new\/path")]),
             DataType.TEXT,
             "some/new/path/file",
             False,

From c975ee97a38020b581aa725ee333e030d64298e9 Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Thu, 22 May 2025 17:45:01 +0200
Subject: [PATCH 4/9] run reformatter

---
 .../mlcroissant/_src/operation_graph/operations/field_test.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
index 6e9e7d4d0..0d32ccc39 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -386,7 +386,7 @@ def test_extract_lines(separator):
             "some/new/path/file",
             False,
         ],
-                [
+        [
             "123",
             Source(transforms=[Transform(replace=r"[0-9]+([0-9])/abc\1")]),
             DataType.TEXT,

From b4816194d266818d1564793b30f9563e35a9eb55 Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Thu, 22 May 2025 17:47:52 +0200
Subject: [PATCH 5/9] fix one more formatting issue

---
 .../mlcroissant/_src/operation_graph/operations/field.py    | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index 7640e8c80..a100e8eb5 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -147,9 +147,9 @@ def _extract_lines(row: pd.Series) -> pd.Series:
     """Reads a file line-by-line and outputs a named pd.Series of the lines."""
     path = epath.Path(row[FileProperty.filepath])
     lines = path.open("rb").read().splitlines()
-    return pd.Series({
-        **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))
-    })
+    return pd.Series(
+        {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))}
+    )
 
 
 def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame:

From 42a721bdf98c22957c0e615e0803ec2e8778c5f4 Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Thu, 22 May 2025 17:52:06 +0200
Subject: [PATCH 6/9] reformat with correct black version

---
 .../mlcroissant/_src/operation_graph/operations/field.py  | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index a100e8eb5..39130ea1c 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -70,7 +70,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any:
         parts = re.split(r"(?<!\\)/", transform.replace)
         if len(parts) != 2:
             raise ValueError(
-                f"`replace` must have exactly one unescaped slash. "
+                "`replace` must have exactly one unescaped slash. "
                 f"Got {transform.replace} which has "
                 f"{len(parts) - 1} unescaped slashes."
             )
@@ -147,9 +147,9 @@ def _extract_lines(row: pd.Series) -> pd.Series:
     """Reads a file line-by-line and outputs a named pd.Series of the lines."""
     path = epath.Path(row[FileProperty.filepath])
     lines = path.open("rb").read().splitlines()
-    return pd.Series(
-        {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))}
-    )
+    return pd.Series({
+        **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))
+    })
 
 
 def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame:

From c73cae9ba3a5a394e00f0c851c46b604e1121acd Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Tue, 27 May 2025 11:14:27 +0100
Subject: [PATCH 7/9] Add test for exception in replace transformation.

---
 .../_src/operation_graph/operations/field.py  |  2 +-
 .../operation_graph/operations/field_test.py  | 33 +++++++++++++++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
index 39130ea1c..c5db18b40 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py
@@ -66,7 +66,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any:
     elif transform.replace is not None:
         if isinstance(value, pathlib.PurePath):
             value = os.fspath(value)
-        # split on unescaped slash
+        # Split on unescaped slash.
         parts = re.split(r"(?<!\\)/", transform.replace)
         if len(parts) != 2:
             raise ValueError(
diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
index 0d32ccc39..0d086c6c8 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -402,6 +402,39 @@ def test_apply_transforms_fn(value, source, data_type, expected_value, repeated)
     assert field.apply_transforms_fn(value, f) == expected_value
 
 
+@pytest.mark.parametrize(
+    ["replace", "unescaped_slashes"],
+    [
+        ["two/unescaped/slashes", 2],
+        [r"no\/unescaped\/slashes", 0],
+        [r"f/o/u/r/\/unescaped\/slashes", 4],
+    ],
+)
+def test_apply_replace_exception_fn(replace, unescaped_slashes):
+    f = Field(
+        id="test",
+        name="test",
+        data_types=DataType.TEXT,
+        source=Source(transforms=[Transform(replace=replace)]),
+        repeated=False,
+    )
+
+    has_error = False
+    try:
+        field.apply_transforms_fn("foo", f)
+    except ValueError as e:
+        has_error = True
+        e.__str__
+        assert (
+            e.args[0]
+            == "`replace` must have exactly one unescaped slash. "
+            f"Got {replace} which has "
+            f"{unescaped_slashes} unescaped slashes."
+        )
+
+    assert has_error
+
+
 def test_apply_multiple_transforms_fn():
     source = Source(
         transforms=[

From c1b1c5fee18ea2f4b4798fd37d8f8011ea92c050 Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Tue, 27 May 2025 11:24:58 +0100
Subject: [PATCH 8/9] add description of "replace" transformation to 1.1 specs.

---
 docs/croissant-spec-draft.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/croissant-spec-draft.md b/docs/croissant-spec-draft.md
index 74109090a..5ddfde654 100644
--- a/docs/croissant-spec-draft.md
+++ b/docs/croissant-spec-draft.md
@@ -1088,6 +1088,7 @@ Croissant supports a few simple transformations that can be applied on the sourc
 - delimiter: split a string into an array using the supplied character.
 - regex: A regular expression to parse the data.
 - jsonPath: A JSON path to evaluate on the (JSON) data source.
+- replace: A string of the form `pattern/replacement` to replace occurrences of `pattern` with `replacement`. Additional forward slashes (`/`) can be escaped with a backslash, and `pattern` can be a regular expression.
 
 For example, to extract information from a filename using a regular expression, we can write:
 

From a66f71b13ad2ac2343b965e8f83dc837caa95fd3 Mon Sep 17 00:00:00 2001
From: Sebastian Schuster <sebschu@gmail.com>
Date: Tue, 27 May 2025 11:36:13 +0100
Subject: [PATCH 9/9] Remove unnecessary statement in replace test.

---
 .../mlcroissant/_src/operation_graph/operations/field_test.py    | 1 -
 1 file changed, 1 deletion(-)

diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
index 0d086c6c8..e8312a432 100644
--- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
+++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py
@@ -424,7 +424,6 @@ def test_apply_replace_exception_fn(replace, unescaped_slashes):
         field.apply_transforms_fn("foo", f)
     except ValueError as e:
         has_error = True
-        e.__str__
         assert (
             e.args[0]
             == "`replace` must have exactly one unescaped slash. "