From 26e7a5490c7d4001192c3e970b6618f521685bea Mon Sep 17 00:00:00 2001 From: Sebastian Schuster Date: Thu, 22 May 2025 16:56:28 +0200 Subject: [PATCH 1/9] Implement "replace" transform in Field operation module. Implements correct functionality for "replace" attribute in mlc.Transform. It takes a string of the form "pattern/replacement" and performs re.sub with `pattern` and `replacement`. Slashes can be escaped using a backslash, e.g., "my\/path/your\/path". --- .../_src/operation_graph/operations/field.py | 10 +++++++++ .../operation_graph/operations/field_test.py | 21 +++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 056899b6e..56e1cd272 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -63,6 +63,16 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: raise ValueError(f"`format` only applies to dates. Got {field.data_type}") elif transform.separator is not None: return value.split(transform.separator) + elif transform.replace is not None: + if isinstance(value, pathlib.PurePath): + value = os.fspath(value) + # split on unescaped slash + parts = re.split(r'(? Date: Thu, 22 May 2025 16:56:28 +0200 Subject: [PATCH 2/9] Implement "replace" transform in Field operation module. Implements correct functionality for "replace" attribute in mlc.Transform. It takes a string of the form "pattern/replacement" and performs re.sub with `pattern` and `replacement`. Slashes can be escaped using a backslash, e.g., "my\/path/your\/path".
--- .../mlcroissant/_src/operation_graph/operations/field.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 56e1cd272..99449a7b9 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -72,7 +72,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: raise ValueError(f"`replace` must have exactly one unescaped slash. Got {transform.replace} which has {len(parts) - 1} unescaped slashes.") parts = [part.replace('\\/', '/') for part in parts] pattern, replacement = parts - return re.sub(pattern, replacement) + return re.sub(pattern, replacement, value) return value From 8e6378a6aff2905b3d7b02323d2ca431e860539f Mon Sep 17 00:00:00 2001 From: Sebastian Schuster Date: Thu, 22 May 2025 17:41:47 +0200 Subject: [PATCH 3/9] fix linter issues --- .../_src/operation_graph/operations/field.py | 10 +++++++--- .../_src/operation_graph/operations/field_test.py | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 99449a7b9..7640e8c80 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -67,10 +67,14 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: if isinstance(value, pathlib.PurePath): value = os.fspath(value) # split on unescaped slash - parts = re.split(r'(? 
Date: Thu, 22 May 2025 17:45:01 +0200 Subject: [PATCH 4/9] run reformatter --- .../mlcroissant/_src/operation_graph/operations/field_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 6e9e7d4d0..0d32ccc39 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -386,7 +386,7 @@ def test_extract_lines(separator): "some/new/path/file", False, ], - [ + [ "123", Source(transforms=[Transform(replace=r"[0-9]+([0-9])/abc\1")]), DataType.TEXT, From b4816194d266818d1564793b30f9563e35a9eb55 Mon Sep 17 00:00:00 2001 From: Sebastian Schuster Date: Thu, 22 May 2025 17:47:52 +0200 Subject: [PATCH 5/9] fix one more formatting issue --- .../mlcroissant/_src/operation_graph/operations/field.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 7640e8c80..a100e8eb5 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -147,9 +147,9 @@ def _extract_lines(row: pd.Series) -> pd.Series: """Reads a file line-by-line and outputs a named pd.Series of the lines.""" path = epath.Path(row[FileProperty.filepath]) lines = path.open("rb").read().splitlines() - return pd.Series({ - **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines)) - }) + return pd.Series( + {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))} + ) def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame: From 42a721bdf98c22957c0e615e0803ec2e8778c5f4 Mon Sep 17 00:00:00 2001 From: Sebastian 
Schuster Date: Thu, 22 May 2025 17:52:06 +0200 Subject: [PATCH 6/9] reformat with correct black version --- .../mlcroissant/_src/operation_graph/operations/field.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index a100e8eb5..39130ea1c 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -70,7 +70,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: parts = re.split(r"(? pd.Series: """Reads a file line-by-line and outputs a named pd.Series of the lines.""" path = epath.Path(row[FileProperty.filepath]) lines = path.open("rb").read().splitlines() - return pd.Series( - {**row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines))} - ) + return pd.Series({ + **row, FileProperty.lines: lines, FileProperty.lineNumbers: range(len(lines)) + }) def _extract_value(df: pd.DataFrame, field: Field) -> pd.DataFrame: From c73cae9ba3a5a394e00f0c851c46b604e1121acd Mon Sep 17 00:00:00 2001 From: Sebastian Schuster Date: Tue, 27 May 2025 11:14:27 +0100 Subject: [PATCH 7/9] Add test for exception in replace transformation. 
--- .../_src/operation_graph/operations/field.py | 2 +- .../operation_graph/operations/field_test.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py index 39130ea1c..c5db18b40 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field.py @@ -66,7 +66,7 @@ def _apply_transform_fn(value: Any, transform: Transform, field: Field) -> Any: elif transform.replace is not None: if isinstance(value, pathlib.PurePath): value = os.fspath(value) - # split on unescaped slash + # Split on unescaped slash. parts = re.split(r"(? Date: Tue, 27 May 2025 11:24:58 +0100 Subject: [PATCH 8/9] add description of "replace" transformation to 1.1 specs. --- docs/croissant-spec-draft.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/croissant-spec-draft.md b/docs/croissant-spec-draft.md index 74109090a..5ddfde654 100644 --- a/docs/croissant-spec-draft.md +++ b/docs/croissant-spec-draft.md @@ -1088,6 +1088,7 @@ Croissant supports a few simple transformations that can be applied on the sourc - delimiter: split a string into an array using the supplied character. - regex: A regular expression to parse the data. - jsonPath: A JSON path to evaluate on the (JSON) data source. +- replace: A string of the form `pattern/replacement` to replace occurrences of `pattern` with `replacement`. Additional forward slashes (`/`) can be escaped with a backslash, and `pattern` can be a regular expression. For example, to extract information from a filename using a regular expression, we can write: From a66f71b13ad2ac2343b965e8f83dc837caa95fd3 Mon Sep 17 00:00:00 2001 From: Sebastian Schuster Date: Tue, 27 May 2025 11:36:13 +0100 Subject: [PATCH 9/9] Remove unnecessary statement in replace test. 
--- .../mlcroissant/_src/operation_graph/operations/field_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py index 0d086c6c8..e8312a432 100644 --- a/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py +++ b/python/mlcroissant/mlcroissant/_src/operation_graph/operations/field_test.py @@ -424,7 +424,6 @@ def test_apply_replace_exception_fn(replace, unescaped_slashes): field.apply_transforms_fn("foo", f) except ValueError as e: has_error = True - e.__str__ assert ( e.args[0] == "`replace` must have exactly one unescaped slash. "