Skip to content

Refactor DateScrubber to use datetime format strings internally #207

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 95 additions & 43 deletions approvaltests/scrubbers/date_scrubber.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import re
from datetime import datetime
from typing import List, Tuple

from approvaltests.scrubbers import create_regex_scrubber
Expand All @@ -6,83 +8,133 @@

class DateScrubber:
@staticmethod
def get_supported_formats() -> List[Tuple[str, List[str]]]:
def _get_internal_formats() -> List[Tuple[str, List[str], List[str]]]:
"""Returns (datetime_format, parsing_examples, display_examples)."""
return [
("%a %b %d %H:%M:%S", ["Tue May 13 16:30:00"], ["Tue May 13 16:30:00"]),
(
"[a-zA-Z]{3} [a-zA-Z]{3} \\d{2} \\d{2}:\\d{2}:\\d{2}",
["Tue May 13 16:30:00"],
),
(
"[a-zA-Z]{3} [a-zA-Z]{3} \\d{2} \\d{2}:\\d{2}:\\d{2} [a-zA-Z]{3,4} \\d{4}",
["Wed Nov 17 22:28:33 EET 2021"],
),
(
"[a-zA-Z]{3} [a-zA-Z]{3} \\d{2} \\d{4} \\d{2}:\\d{2}:\\d{2}.\\d{3}",
"%a %b %d %Y %H:%M:%S.%f",
["Tue May 13 2014 23:30:00.789000"],
["Tue May 13 2014 23:30:00.789"],
),
(
"[a-zA-Z]{3} [a-zA-Z]{3} \\d{2} \\d{2}:\\d{2}:\\d{2} -\\d{4} \\d{4}",
["Tue May 13 16:30:00 -0800 2014"],
),
(
"\\d{2} [a-zA-Z]{3} \\d{4} \\d{2}:\\d{2}:\\d{2},\\d{3}",
"%d %b %Y %H:%M:%S,%f",
["13 May 2014 23:50:49,999000"],
["13 May 2014 23:50:49,999"],
),
("%H:%M:%S", ["23:30:00"], ["23:30:00"]),
(
"[a-zA-Z]{3} \\d{2}, \\d{4} \\d{2}:\\d{2}:\\d{2} [a-zA-Z]{2} [a-zA-Z]{3}",
["May 13, 2014 11:30:00 PM PST"],
),
("\\d{2}:\\d{2}:\\d{2}", ["23:30:00"]),
(
"\\d{4}/\\d{2}/\\d{2} \\d{2}:\\d{2}:\\d{2}.\\d{2}\\d",
"%Y/%m/%d %H:%M:%S.%f",
["2014/05/13 16:30:59.786000"],
["2014/05/13 16:30:59.786"],
),
("%Y-%m-%dT%H:%M:%SZ", ["2020-09-10T08:07:00Z"], ["2020-09-10T08:07:00Z"]),
(
"\\d{4}-\\d{1,2}-\\d{1,2}T\\d{1,2}:\\d{2}Z",
[
"2020-9-10T08:07Z",
"2020-09-9T08:07Z",
"2020-09-10T8:07Z",
"2020-09-10T08:07Z",
],
),
(
"\\d{4}-\\d{1,2}-\\d{1,2}T\\d{1,2}:\\d{2}:\\d{2}Z",
["2020-09-10T08:07:89Z"],
"%Y-%m-%dT%H:%M:%S.%fZ",
["2020-09-10T01:23:45.678000Z"],
["2020-09-10T01:23:45.678Z"],
),
(
"\\d{4}-\\d{1,2}-\\d{1,2}T\\d{1,2}:\\d{2}\\:\\d{2}\\.\\d{3}Z",
["2020-09-10T01:23:45.678Z"],
"%Y-%m-%d %H:%M:%S.%f",
["2023-07-16 17:39:03.293919"],
["2023-07-16 17:39:03.293919"],
),
(
r"\d{4}-\d{1,2}-\d{1,2}(?:T| )\d{1,2}:\d{2}:\d{2}\.\d{6}",
["2023-07-16 17:39:03.293919", "2023-12-06T11:59:47.090226"],
"%Y-%m-%dT%H:%M:%S.%f",
["2023-12-06T11:59:47.090226"],
["2023-12-06T11:59:47.090226"],
),
("\\d{8}T\\d{6}Z", ["20210505T091112Z"]),
("%Y%m%dT%H%M%SZ", ["20210505T091112Z"], ["20210505T091112Z"]),
(
r"(Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s([0-3]?\d)\s([0-1]\d:[0-5]\d:[0-5]\d)\s(\d{4})",
"%a %b %d %H:%M:%S %Y",
["Tue May 13 16:30:00 2014"],
["Tue May 13 16:30:00 2014", "Wed Dec 11 14:59:44 2024"],
),
(
r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}",
"%Y-%m-%dT%H:%M:%S%z",
["2021-09-10T08:07:00+0300"],
["2021-09-10T08:07:00+03:00", "2021-01-01T00:00:00+00:00"],
),
("%Y%m%d_%H%M%S", ["20250527_125703"], ["20250527_125703"]),
]

def __init__(self, date_regex: str):
self.date_regex = date_regex
@staticmethod
def get_supported_formats() -> List[Tuple[str, List[str]]]:
    """Return ``(regex_pattern, display_examples)`` pairs for the public API.

    Every internal datetime format is turned into its regex by building a
    ``DateScrubber`` from it and reading ``date_regex``; the parsing examples
    are dropped and only the display examples are exposed.
    """
    return [
        (DateScrubber(internal_format).date_regex, shown_examples)
        for internal_format, _parsing_examples, shown_examples
        in DateScrubber._get_internal_formats()
    ]

def __init__(self, date_format: str):
    """Store the datetime format and the regex pattern derived from it.

    Args:
        date_format: A datetime format string such as ``"%H:%M:%S"``.
    """
    self.date_format = date_format
    # The regex is computed once here and reused by every later scrub call.
    derived_regex = self._convert_format_to_regex(date_format)
    self.date_regex = derived_regex

def _convert_format_to_regex(self, date_format: str) -> str:
    """Convert a datetime format string to a regex pattern for scrubbing.

    Each supported ``%`` directive is replaced by a regex fragment matching
    the text that directive stands for. All remaining characters (including
    unsupported directives) are treated as literal text and passed through
    ``re.escape``.

    Args:
        date_format: A ``strftime``/``strptime`` style format string,
            e.g. ``"%Y-%m-%dT%H:%M:%SZ"``.

    Returns:
        The regex pattern. No ``^``/``$`` anchors are added.
    """
    format_to_regex = {
        "%a": r"[A-Za-z]{3}",  # Abbreviated weekday
        "%A": r"[A-Za-z]+",  # Full weekday
        "%b": r"[A-Za-z]{3}",  # Abbreviated month
        "%B": r"[A-Za-z]+",  # Full month
        "%d": r"\d{2}",  # Day of month (01-31)
        "%H": r"\d{2}",  # Hour (00-23)
        "%I": r"\d{2}",  # Hour (01-12)
        "%m": r"\d{2}",  # Month (01-12)
        "%M": r"\d{2}",  # Minute (00-59)
        "%p": r"[AP]M",  # AM/PM
        "%S": r"\d{2}",  # Second (00-59)
        "%Y": r"\d{4}",  # Year (4 digits)
        "%y": r"\d{2}",  # Year (2 digits)
        "%Z": r"[A-Z]{3,4}",  # Timezone abbreviation
        # strptime accepts both "+0300" and "+03:00" for %z, so the colon
        # must be optional or parsed examples would never be scrubbed.
        "%z": r"[+\-]\d{2}:?\d{2}",
        # strptime accepts one to six fractional digits for %f
        # (e.g. ".789" as well as ".789000").
        "%f": r"\d{1,6}",
        "%%": "%",  # Escaped literal percent sign
    }

    # Walk the format once, escaping literal text between directives. Doing
    # it in a single pass avoids temporary placeholders that could collide
    # with literal text, and lets "%%" consume its second "%" so that
    # "%%d" is a literal "%d" rather than "%" followed by a day directive.
    pieces = []
    literal_start = 0
    for directive in re.finditer(r"%(.)", date_format):
        pieces.append(re.escape(date_format[literal_start:directive.start()]))
        code = directive.group(0)
        # Unknown directives are kept as literal text.
        pieces.append(format_to_regex.get(code, re.escape(code)))
        literal_start = directive.end()
    pieces.append(re.escape(date_format[literal_start:]))

    return "".join(pieces)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

suggestion (bug_risk): Regex patterns are not anchored, may match substrings unexpectedly

Anchor the pattern with ^ and $, or use word boundaries, to ensure only complete date tokens are matched.

Suggested change
return regex_pattern
# Anchor the pattern to match the entire string
return f"^{regex_pattern}$"


def scrub(self, date_str: str) -> str:
    """Return ``date_str`` with text matching ``self.date_regex`` scrubbed.

    The work is delegated to ``create_regex_scrubber``; each replacement is
    rendered as ``<dateN>``, where ``N`` is whatever value that helper hands
    to the formatting callback (the tests expect ``<date0>`` for a single
    date).
    """

    def as_date_tag(token) -> str:
        return f"<date{token}>"

    replace_dates = create_regex_scrubber(self.date_regex, as_date_tag)
    return replace_dates(date_str)

@staticmethod
def get_scrubber_for(example: str) -> Scrubber:
# Build error message with regex patterns for external display
supported = ""
for date_regex, examples in DateScrubber.get_supported_formats():
supported += f" {examples[0]} | {date_regex} \n"
scrubber = DateScrubber(date_regex)
if scrubber.scrub(example) == "<date0>":

# Try to parse with internal datetime formats
for (
date_format,
parsing_examples,
display_examples,
) in DateScrubber._get_internal_formats():
try:
datetime.strptime(example, date_format)
scrubber = DateScrubber(date_format)
return scrubber.scrub
except ValueError:
continue

raise Exception(
f"No match found for '{example}'.\n Feel free to add your date at https://github.com/approvals/ApprovalTests.Python/issues/124 \n Current supported formats are: \n{supported}"
Expand Down
12 changes: 5 additions & 7 deletions tests/scrubbers/test_date_scrubber.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,17 +4,15 @@


def test_supported_formats() -> None:
supported_formats = DateScrubber.get_supported_formats()
for date_regex, examples in supported_formats:
for example in examples:
assert DateScrubber(date_regex).scrub(example) == "<date0>"
internal_formats = DateScrubber._get_internal_formats()
for date_format, parsing_examples, display_examples in internal_formats:
for example in parsing_examples:
assert DateScrubber(date_format).scrub(example) == "<date0>"


def test_supported_formats_arbitrary_string() -> None:
assert (
DateScrubber("[a-zA-Z]{3} [a-zA-Z]{3} \\d{2} \\d{2}:\\d{2}:\\d{2}").scrub(
"arbitrary string"
)
DateScrubber("%a %b %d %H:%M:%S").scrub("arbitrary string")
== "arbitrary string"
)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
| Example Date | Regex Pattern |
| --- | --- |
| Tue May 13 16:30:00 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{2}:\d{2}:\d{2} |
| Wed Nov 17 22:28:33 EET 2021 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{2}:\d{2}:\d{2} [a-zA-Z]{3,4} \d{4} |
| Tue May 13 2014 23:30:00.789 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{4} \d{2}:\d{2}:\d{2}.\d{3} |
| Tue May 13 16:30:00 -0800 2014 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{2}:\d{2}:\d{2} -\d{4} \d{4} |
| 13 May 2014 23:50:49,999 | \d{2} [a-zA-Z]{3} \d{4} \d{2}:\d{2}:\d{2},\d{3} |
| May 13, 2014 11:30:00 PM PST | [a-zA-Z]{3} \d{2}, \d{4} \d{2}:\d{2}:\d{2} [a-zA-Z]{2} [a-zA-Z]{3} |
| Tue May 13 16:30:00 | [A-Za-z]{3}\ [A-Za-z]{3}\ \d{2}\ \d{2}:\d{2}:\d{2} |
| Tue May 13 2014 23:30:00.789 | [A-Za-z]{3}\ [A-Za-z]{3}\ \d{2}\ \d{4}\ \d{2}:\d{2}:\d{2}\.\d{6} |
| 13 May 2014 23:50:49,999 | \d{2}\ [A-Za-z]{3}\ \d{4}\ \d{2}:\d{2}:\d{2},\d{6} |
| 23:30:00 | \d{2}:\d{2}:\d{2} |
| 2014/05/13 16:30:59.786 | \d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}.\d{2}\d |
| 2020-9-10T08:07Z | \d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{2}Z |
| 2020-09-10T08:07:89Z | \d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{2}:\d{2}Z |
| 2020-09-10T01:23:45.678Z | \d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{2}\:\d{2}\.\d{3}Z |
| 2023-07-16 17:39:03.293919 | \d{4}-\d{1,2}-\d{1,2}(?:T| )\d{1,2}:\d{2}:\d{2}\.\d{6} |
| 20210505T091112Z | \d{8}T\d{6}Z |
| Tue May 13 16:30:00 2014 | (Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s([0-3]?\d)\s([0-1]\d:[0-5]\d:[0-5]\d)\s(\d{4}) |
| 2021-09-10T08:07:00+03:00 | \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2} |
| 2014/05/13 16:30:59.786 | \d{4}/\d{2}/\d{2}\ \d{2}:\d{2}:\d{2}\.\d{6} |
| 2020-09-10T08:07:00Z | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}Z |
| 2020-09-10T01:23:45.678Z | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z |
| 2023-07-16 17:39:03.293919 | \d{4}\-\d{2}\-\d{2}\ \d{2}:\d{2}:\d{2}\.\d{6} |
| 2023-12-06T11:59:47.090226 | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6} |
| 20210505T091112Z | \d{4}\d{2}\d{2}T\d{2}\d{2}\d{2}Z |
| Tue May 13 16:30:00 2014 | [A-Za-z]{3}\ [A-Za-z]{3}\ \d{2}\ \d{2}:\d{2}:\d{2}\ \d{4} |
| 2021-09-10T08:07:00+03:00 | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}[+\-]\d{4} |
| 20250527_125703 | \d{4}\d{2}\d{2}_\d{2}\d{2}\d{2} |
Original file line number Diff line number Diff line change
@@ -1,18 +1,16 @@
Exception: No match found for 'an unsupported format'.
Feel free to add your date at https://github.com/approvals/ApprovalTests.Python/issues/124
Current supported formats are:
Tue May 13 16:30:00 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{2}:\d{2}:\d{2}
Wed Nov 17 22:28:33 EET 2021 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{2}:\d{2}:\d{2} [a-zA-Z]{3,4} \d{4}
Tue May 13 2014 23:30:00.789 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{4} \d{2}:\d{2}:\d{2}.\d{3}
Tue May 13 16:30:00 -0800 2014 | [a-zA-Z]{3} [a-zA-Z]{3} \d{2} \d{2}:\d{2}:\d{2} -\d{4} \d{4}
13 May 2014 23:50:49,999 | \d{2} [a-zA-Z]{3} \d{4} \d{2}:\d{2}:\d{2},\d{3}
May 13, 2014 11:30:00 PM PST | [a-zA-Z]{3} \d{2}, \d{4} \d{2}:\d{2}:\d{2} [a-zA-Z]{2} [a-zA-Z]{3}
Tue May 13 16:30:00 | [A-Za-z]{3}\ [A-Za-z]{3}\ \d{2}\ \d{2}:\d{2}:\d{2}
Tue May 13 2014 23:30:00.789 | [A-Za-z]{3}\ [A-Za-z]{3}\ \d{2}\ \d{4}\ \d{2}:\d{2}:\d{2}\.\d{6}
13 May 2014 23:50:49,999 | \d{2}\ [A-Za-z]{3}\ \d{4}\ \d{2}:\d{2}:\d{2},\d{6}
23:30:00 | \d{2}:\d{2}:\d{2}
2014/05/13 16:30:59.786 | \d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2}.\d{2}\d
2020-9-10T08:07Z | \d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{2}Z
2020-09-10T08:07:89Z | \d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{2}:\d{2}Z
2020-09-10T01:23:45.678Z | \d{4}-\d{1,2}-\d{1,2}T\d{1,2}:\d{2}\:\d{2}\.\d{3}Z
2023-07-16 17:39:03.293919 | \d{4}-\d{1,2}-\d{1,2}(?:T| )\d{1,2}:\d{2}:\d{2}\.\d{6}
20210505T091112Z | \d{8}T\d{6}Z
Tue May 13 16:30:00 2014 | (Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)\s([0-3]?\d)\s([0-1]\d:[0-5]\d:[0-5]\d)\s(\d{4})
2021-09-10T08:07:00+03:00 | \d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\+\d{2}:\d{2}
2014/05/13 16:30:59.786 | \d{4}/\d{2}/\d{2}\ \d{2}:\d{2}:\d{2}\.\d{6}
2020-09-10T08:07:00Z | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}Z
2020-09-10T01:23:45.678Z | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}Z
2023-07-16 17:39:03.293919 | \d{4}\-\d{2}\-\d{2}\ \d{2}:\d{2}:\d{2}\.\d{6}
2023-12-06T11:59:47.090226 | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}\.\d{6}
20210505T091112Z | \d{4}\d{2}\d{2}T\d{2}\d{2}\d{2}Z
Tue May 13 16:30:00 2014 | [A-Za-z]{3}\ [A-Za-z]{3}\ \d{2}\ \d{2}:\d{2}:\d{2}\ \d{4}
2021-09-10T08:07:00+03:00 | \d{4}\-\d{2}\-\d{2}T\d{2}:\d{2}:\d{2}[+\-]\d{4}
20250527_125703 | \d{4}\d{2}\d{2}_\d{2}\d{2}\d{2}
Loading