Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
189ad57
#1654 added regex to is_inconsistent_across_dataset operation
alexfurmenkov Mar 10, 2026
9f2a751
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
alexfurmenkov Mar 10, 2026
6f963d3
#1654 additional tests for regexp in is_inconsistent_across_dataset o…
alexfurmenkov Mar 11, 2026
da36f8f
#1654 added info about regex in is_inconsistent_across_dataset
alexfurmenkov Mar 11, 2026
b812b6d
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
alexfurmenkov Mar 11, 2026
2345bb8
#1654 Operator.md prettier fix
alexfurmenkov Mar 11, 2026
a40bc08
Merge remote-tracking branch 'origin/1654-is-inconsistent-across-data…
alexfurmenkov Mar 11, 2026
bc790b9
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
RamilCDISC Mar 11, 2026
fcee4e7
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
alexfurmenkov Mar 14, 2026
1c1dad9
#1654 Operator.md prettier
alexfurmenkov Mar 16, 2026
32bb396
Update merged schema files with markdown descriptions
Mar 16, 2026
f165f69
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
alexfurmenkov Mar 16, 2026
77855ff
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
RamilCDISC Mar 16, 2026
0444f60
#1654 fix when regex passed as list(from the rule)
alexfurmenkov Mar 17, 2026
a4ae822
#1654 extended tests. fixed app operation in case of bad regex
alexfurmenkov Mar 18, 2026
4c40b8d
#1654 incorrect regex raises error
alexfurmenkov Mar 23, 2026
c420aa6
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
alexfurmenkov Mar 23, 2026
f974fa1
Merge branch 'main' into 1654-is-inconsistent-across-dataset-regex
gerrycampion Mar 23, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions cdisc_rules_engine/check_operators/dataframe_operators.py
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,10 @@ def is_complete_date(self, other_value):
def is_inconsistent_across_dataset(self, other_value):
target = other_value.get("target")
comparator = other_value.get("comparator")
regex = other_value.get("regex")
if isinstance(regex, list) and regex:
regex = regex[0]

grouping_cols = []
if isinstance(comparator, str):
if comparator in self.value.columns:
Expand All @@ -1162,6 +1166,22 @@ def is_inconsistent_across_dataset(self, other_value):
grouping_cols.append(col)
df_check = self.value[grouping_cols + [target]].copy()
df_check = df_check.fillna("_NaN_")
if regex:
try:
pattern = re.compile(regex)
except re.error:
raise ValueError(
f"Invalid regex: {regex}. Remove parameter or fix the regex."
)
if pattern.groups == 0:
regex = f"({regex})"
extracted = df_check[target].astype(str).str.extract(regex, expand=True)[0]
df_check[target] = extracted.fillna(df_check[target])
results = self._check_inconsistency(df_check, grouping_cols, target)
return results

@staticmethod
def _check_inconsistency(df_check, grouping_cols: list[Any], target):
results = pd.Series(False, index=df_check.index)
for name, group in df_check.groupby(grouping_cols, dropna=False):
if group[target].nunique() > 1:
Expand Down
2 changes: 1 addition & 1 deletion resources/schema/rule-merged/Operator.json
Original file line number Diff line number Diff line change
Expand Up @@ -462,7 +462,7 @@
"properties": {
"operator": {
"const": "is_inconsistent_across_dataset",
"markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\n```yaml\n- name: \"BGSTRESU\"\n operator: is_inconsistent_across_dataset\n value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n operator: is_inconsistent_across_dataset\n value:\n - \"--TESTCD\"\n - \"--CAT\"\n - \"--SCAT\"\n - \"--SPEC\"\n - \"--METHOD\"\n```\n"
"markdownDescription": "\nChecks if a variable maintains consistent values within groups defined by one or more grouping variables. Groups records by specified value(s) and validates that the target variable maintains the same value within each unique combination of grouping variables. When inconsistency is detected within a group, the operator attempts to identify a majority value. If one value appears more frequently than all others, only the minority records (those not matching the majority value) are flagged. If no single majority exists \u2014 i.e., two or more values are tied for the highest frequency \u2014 all records in that group are flagged.\n\nSingle grouping variable - true if the values of BGSTRESU differ within USUBJID:\n\nIf a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value).\n\n- regex is optional.\n- The pattern must include at least one capture group(or whole regex will be wrapped to capture group).\n- Only the first capture group is used for comparison.\n- If the pattern does not match a value, the original value is used.\n\n```yaml\n- name: \"BGSTRESU\"\n operator: is_inconsistent_across_dataset\n value: \"USUBJID\"\n```\n\nMultiple grouping variables - true if the values of --STRESU differ within each combination of --TESTCD, --CAT, --SCAT, --SPEC, and --METHOD:\n\n```yaml\n- name: \"--STRESU\"\n operator: is_inconsistent_across_dataset\n value:\n - \"--TESTCD\"\n - \"--CAT\"\n - \"--SCAT\"\n - \"--SPEC\"\n - \"--METHOD\"\n```\n"
}
},
"required": ["operator", "value"],
Expand Down
7 changes: 7 additions & 0 deletions resources/schema/rule/Operator.md
Original file line number Diff line number Diff line change
Expand Up @@ -996,6 +996,13 @@ Checks if a variable maintains consistent values within groups defined by one or

Single grouping variable - true if the values of BGSTRESU differ within USUBJID:

If a regex parameter is provided, it is applied to the values of the target variable before the consistency check. The first capture group of the regex is used as the normalized value for comparison. This can be useful when only part of the value should be considered during comparison (for example, comparing only the date portion of a datetime value).

- regex is optional.
- The pattern should include at least one capture group; if it has none, the whole regex is wrapped in a capture group.
- Only the first capture group is used for comparison.
- If the pattern does not match a value, the original value is used.

```yaml
- name: "BGSTRESU"
operator: is_inconsistent_across_dataset
Expand Down
110 changes: 110 additions & 0 deletions tests/unit/test_check_operators/test_value_set_checks.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,6 +244,116 @@ def test_is_inconsistent_across_dataset(
assert result.equals(df.convert_to_series(expected_result))


@pytest.mark.parametrize(
    "values,regex,expected",
    [
        # Falsy regex (None or ""): the raw values are compared unchanged.
        (["A", "B"], None, [True, True]),
        (["TEST_v1", "TEST_v2"], "", [True, True]),
        (["TEST", "TEST"], "", [False, False]),
        # The first capture group is the same on both rows -> consistent.
        (["TEST_v1", "TEST_v2"], r"^(TEST)", [False, False]),
        (["ABC123", "XYZ123"], r"(\d+)", [False, False]),
        (["HEIGHT_cm", "HEIGHT_mm"], r"^(HEIGHT)", [False, False]),
        # Same date portion, different time of day.
        (
            ["2014-09-30T11:09", "2014-09-30T11:07"],
            r"^(\d{4}-\d{2}-\d{2})",
            [False, False],
        ),
        # The first capture group differs between rows -> inconsistent.
        (["TEST_A", "TEST_B"], r"^(TEST_[A-Z])", [True, True]),
        (["SUBJ-001", "SUBJ-002"], r"SUBJ-(\d+)", [True, True]),
        (
            ["2014-09-30T11:09", "2014-09-29T11:07"],
            r"^(\d{4}-\d{2}-\d{2})",
            [True, True],
        ),
        # Pattern without any capture group, and one that matches no row.
        (["ABC", "DEF"], r"^XYZ", [True, True]),
        # Pattern matching only some rows; the others keep their own value.
        (["TEST_v1", "CONTROL"], r"^(TEST)", [True, True]),
        # Catch-all group: the full value is compared.
        (["A", "B"], r"(.*)", [True, True]),
        # Missing values alongside, or instead of, real ones.
        (["A", None], r"(A)", [True, True]),
        ([None, None], r"(.*)", [False, False]),
        # Non-string values.
        ([1, 1], r"(\d+)", [False, False]),
        # Pattern with more than one capture group.
        ([1, 1], r"(\d+)(\d+)", [False, False]),
        # regex supplied as a list of patterns rather than a single string.
        (
            [1, 1],
            [
                r"(\d+)(\d+)",
                r"(\d+)(\d+)",
            ],
            [False, False],
        ),
    ],
)
def test_is_inconsistent_across_dataset_regex(values, regex, expected):
    """Check the optional ``regex`` parameter of is_inconsistent_across_dataset.

    Every row carries the same VISIT/EPOCH pair, so the whole frame forms a
    single group; the outcome therefore depends only on how the VALUE column
    compares once ``regex`` has been applied.
    """
    row_count = len(values)
    dataset = pd.DataFrame(
        {
            "VISIT": ["WEEK1"] * row_count,
            "EPOCH": ["TREATMENT"] * row_count,
            "VALUE": values,
        }
    )
    operator = DataframeType({"value": dataset})

    flags = operator.is_inconsistent_across_dataset(
        {
            "target": "VALUE",
            "comparator": ["VISIT", "EPOCH"],
            "regex": regex,
        }
    )

    assert flags.tolist() == expected


@pytest.mark.parametrize(
    "values,regex,expected",
    [
        # Rows whose raw values differ.
        (["TEST_v1", "TEST_v2"], "AABB???", [True, True]),
        (["TEST_v1", "TEST_v2"], "AA(C(B)A", [True, True]),
        (["TEST_v1", "TEST_v2"], "AA(C)B)A", [True, True]),
        (["TEST_v1", "TEST_v2"], "\\", [True, True]),
        (["TEST_v1", "TEST_v2"], "**", [True, True]),
        # Rows whose raw values are identical.
        (["TEST", "TEST"], "AABB???", [False, False]),
        (["TEST", "TEST"], "AA(C(B)A", [False, False]),
        (["TEST", "TEST"], "AA(C)B)A", [False, False]),
        (["TEST", "TEST"], "\\", [False, False]),
        (["TEST", "TEST"], "**", [False, False]),
    ],
)
def test_is_inconsistent_across_dataset_regex_ignores_bad_regex(
    values, regex, expected
):
    """Each pattern in the table must make the operator raise ValueError.

    NOTE(review): despite "ignores" in the test name, the assertion below
    expects an error rather than a silent fallback, and the ``expected``
    column of the parameter table is never read - consider renaming the test
    and dropping that column.
    """
    row_count = len(values)
    dataset = pd.DataFrame(
        {
            "VISIT": ["WEEK1"] * row_count,
            "EPOCH": ["TREATMENT"] * row_count,
            "VALUE": values,
        }
    )
    operator = DataframeType({"value": dataset})
    params = {
        "target": "VALUE",
        "comparator": ["VISIT", "EPOCH"],
        "regex": regex,
    }

    with pytest.raises(ValueError):
        operator.is_inconsistent_across_dataset(params)


@pytest.mark.parametrize(
"target, comparator, dataset_type, expected_result",
[
Expand Down
Loading