cdisc-org · gerrycampion · Mar 24, 2026 · Mar 10, 2026 · Mar 10, 2026 · Mar 11, 2026
diff --git a/cdisc_rules_engine/check_operators/dataframe_operators.py b/cdisc_rules_engine/check_operators/dataframe_operators.py
@@ -1152,6 +1152,10 @@ def is_complete_date(self, other_value):
     def is_inconsistent_across_dataset(self, other_value):
         target = other_value.get("target")
         comparator = other_value.get("comparator")
+        regex = other_value.get("regex")
+        if isinstance(regex, list) and regex:
+            regex = regex[0]
+
         grouping_cols = []
         if isinstance(comparator, str):
             if comparator in self.value.columns:
@@ -1162,6 +1166,22 @@ def is_inconsistent_across_dataset(self, other_value):
                     grouping_cols.append(col)
         df_check = self.value[grouping_cols + [target]].copy()
         df_check = df_check.fillna("_NaN_")
+        if regex:
+            try:
+                pattern = re.compile(regex)
+            except re.error:
+                raise ValueError(
+                    f"Invalid regex: {regex}. Remove parameter or fix the regex."
+                )
+            if pattern.groups == 0:
+                regex = f"({regex})"
+            extracted = df_check[target].astype(str).str.extract(regex, expand=True)[0]
+            df_check[target] = extracted.fillna(df_check[target])
+        results = self._check_inconsistency(df_check, grouping_cols, target)
+        return results
+
+    @staticmethod
+    def _check_inconsistency(df_check, grouping_cols: list[Any], target):
         results = pd.Series(False, index=df_check.index)
         for name, group in df_check.groupby(grouping_cols, dropna=False):
             if group[target].nunique() > 1:

diff --git a/resources/schema/rule-merged/Operator.json b/resources/schema/rule-merged/Operator.json
@@ -462,7 +462,7 @@
       "properties": {
         "operator": {
           "const": "is_inconsistent_across_dataset",
-          "markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\n```yaml\n- name: \"BGSTRESU\"\n  operator: is_inconsistent_across_dataset\n  value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n  operator: is_inconsistent_across_dataset\n  value:\n    - \"--TESTCD\"\n    - \"--CAT\"\n    - \"--SCAT\"\n    - \"--SPEC\"\n    - \"--METHOD\"\n```\n"
+          "markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\nIf a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value).\n\n- regex is optional.\n- The pattern should include at least one capture group; if it has none, the whole pattern is wrapped in a capture group.\n- Only the first capture group is used for comparison.\n- If the pattern does not match a value, the original value is used.\n\n```yaml\n- name: \"BGSTRESU\"\n  operator: is_inconsistent_across_dataset\n  value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n  operator: is_inconsistent_across_dataset\n  value:\n    - \"--TESTCD\"\n    - \"--CAT\"\n    - \"--SCAT\"\n    - \"--SPEC\"\n    - \"--METHOD\"\n```\n"
         }
       },
       "required": ["operator", "value"],

diff --git a/resources/schema/rule/Operator.md b/resources/schema/rule/Operator.md
@@ -996,6 +996,13 @@ Checks if a variable maintains consistent values within groups defined by one or
 
 Single grouping variable - true if the values of BGSTRESU differ within USUBJID:
 
+If a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value).
+
+- regex is optional.
+- The pattern should include at least one capture group; if it has none, the whole pattern is wrapped in a capture group.
+- Only the first capture group is used for comparison.
+- If the pattern does not match a value, the original value is used.
+
 ```yaml
 - name: "BGSTRESU"
   operator: is_inconsistent_across_dataset

diff --git a/tests/unit/test_check_operators/test_value_set_checks.py b/tests/unit/test_check_operators/test_value_set_checks.py
@@ -244,6 +244,116 @@ def test_is_inconsistent_across_dataset(
     assert result.equals(df.convert_to_series(expected_result))
 
 
+@pytest.mark.parametrize(
+    "values,regex,expected",
+    [
+        # regex disabled
+        (["A", "B"], None, [True, True]),
+        (["TEST_v1", "TEST_v2"], "", [True, True]),
+        (["TEST", "TEST"], "", [False, False]),
+        # regex collapsing values
+        (["TEST_v1", "TEST_v2"], r"^(TEST)", [False, False]),
+        (["ABC123", "XYZ123"], r"(\d+)", [False, False]),
+        (["HEIGHT_cm", "HEIGHT_mm"], r"^(HEIGHT)", [False, False]),
+        # datetime normalization
+        (
+            ["2014-09-30T11:09", "2014-09-30T11:07"],
+            r"^(\d{4}-\d{2}-\d{2})",
+            [False, False],
+        ),
+        (["TEST_A", "TEST_B"], r"^(TEST_[A-Z])", [True, True]),
+        (["SUBJ-001", "SUBJ-002"], r"SUBJ-(\d+)", [True, True]),
+        (
+            ["2014-09-30T11:09", "2014-09-29T11:07"],
+            r"^(\d{4}-\d{2}-\d{2})",
+            [True, True],
+        ),
+        # no capture group, non-matching patterns, and None/numeric values
+        (["ABC", "DEF"], r"^XYZ", [True, True]),
+        (["TEST_v1", "CONTROL"], r"^(TEST)", [True, True]),
+        (["A", "B"], r"(.*)", [True, True]),
+        (["A", None], r"(A)", [True, True]),
+        ([None, None], r"(.*)", [False, False]),
+        ([1, 1], r"(\d+)", [False, False]),
+        # multiple capture groups
+        ([1, 1], r"(\d+)(\d+)", [False, False]),
+        # multiple regex
+        (
+            [1, 1],
+            [
+                r"(\d+)(\d+)",
+                r"(\d+)(\d+)",
+            ],
+            [False, False],
+        ),
+    ],
+)
+def test_is_inconsistent_across_dataset_regex(values, regex, expected):
+    df = pd.DataFrame(
+        {
+            "VISIT": ["WEEK1"] * len(values),
+            "EPOCH": ["TREATMENT"] * len(values),
+            "VALUE": values,
+        }
+    )
+
+    other_value = {
+        "target": "VALUE",
+        "comparator": ["VISIT", "EPOCH"],
+        "regex": regex,
+    }
+
+    obj = DataframeType(
+        {
+            "value": df,
+        }
+    )
+    result = obj.is_inconsistent_across_dataset(other_value)
+
+    assert result.tolist() == expected
+
+
+@pytest.mark.parametrize(
+    "values,regex,expected",
+    [
+        (["TEST_v1", "TEST_v2"], "AABB???", [True, True]),
+        (["TEST_v1", "TEST_v2"], "AA(C(B)A", [True, True]),
+        (["TEST_v1", "TEST_v2"], "AA(C)B)A", [True, True]),
+        (["TEST_v1", "TEST_v2"], "\\", [True, True]),
+        (["TEST_v1", "TEST_v2"], "**", [True, True]),
+        (["TEST", "TEST"], "AABB???", [False, False]),
+        (["TEST", "TEST"], "AA(C(B)A", [False, False]),
+        (["TEST", "TEST"], "AA(C)B)A", [False, False]),
+        (["TEST", "TEST"], "\\", [False, False]),
+        (["TEST", "TEST"], "**", [False, False]),
+    ],
+)
+def test_is_inconsistent_across_dataset_regex_ignores_bad_regex(
+    values, regex, expected
+):
+    df = pd.DataFrame(
+        {
+            "VISIT": ["WEEK1"] * len(values),
+            "EPOCH": ["TREATMENT"] * len(values),
+            "VALUE": values,
+        }
+    )
+
+    other_value = {
+        "target": "VALUE",
+        "comparator": ["VISIT", "EPOCH"],
+        "regex": regex,
+    }
+
+    obj = DataframeType(
+        {
+            "value": df,
+        }
+    )
+    with pytest.raises(ValueError):
+        obj.is_inconsistent_across_dataset(other_value)
+
+
 @pytest.mark.parametrize(
     "target, comparator, dataset_type, expected_result",
     [