diff --git a/plugin/action/hash/normalize/token_normalizer.go b/plugin/action/hash/normalize/token_normalizer.go index 526b12331..a6f761986 100644 --- a/plugin/action/hash/normalize/token_normalizer.go +++ b/plugin/action/hash/normalize/token_normalizer.go @@ -208,7 +208,12 @@ func initTokens(lexer *lexmachine.Lexer, addTokens := func(patterns []TokenPattern) { for _, p := range patterns { if p.mask == 0 || builtinPatterns&p.mask != 0 { - lexer.Add([]byte(p.RE), newToken(p.Placeholder)) + switch p.mask { + case pFilepath: + lexer.Add([]byte(p.RE), newFilepathToken(p.Placeholder)) + default: + lexer.Add([]byte(p.RE), newToken(p.Placeholder)) + } } } } @@ -264,6 +269,23 @@ func newToken(placeholder string) lexmachine.Action { } } +func newFilepathToken(placeholder string) lexmachine.Action { + return func(s *lexmachine.Scanner, m *machines.Match) (any, error) { + // skip `\w\w` + if m.TC > 0 && isWord(s.Text[m.TC-1]) || + m.TC+len(m.Bytes) < len(s.Text) && isWord(s.Text[m.TC+len(m.Bytes)]) { + s.TC = m.TC + 1 + return nil, nil + } + + return token{ + placeholder: placeholder, + begin: m.TC, + end: m.TC + len(m.Bytes), + }, nil + } +} + func (n *tokenNormalizer) normalizeByScanner(out []byte, scanner *lexmachine.Scanner) []byte { prevEnd := 0 for tokRaw, err, eos := scanner.Next(); !eos; tokRaw, err, eos = scanner.Next() { @@ -511,10 +533,10 @@ var builtinTokenPatterns = []TokenPattern{ mask: pHash, }, { - // RFC3339, RFC3339Nano, DateTime, DateOnly, TimeOnly, Go time with monotonic clock + // RFC3339, RFC3339Nano, DateTime, DateOnly, TimeOnly, Go time with optional monotonic clock Placeholder: placeholderByPattern[pDatetime], RE: fmt.Sprintf(`(%s)|(%s)|(%s)|(%s)`, - `\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [+\-]\d\d\d\d [A-Z]+ m=[+\-]\d+\.\d+`, + `\d\d\d\d-\d\d-\d\d \d\d:\d\d:\d\d\.\d+ [+\-]\d\d\d\d [A-Z]+( m=[+\-]\d+\.\d+)?`, `\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?(Z|[\+\-]\d\d:\d\d)`, `\d\d:\d\d:\d\d`, `\d\d\d\d-\d\d-\d\d( \d\d:\d\d:\d\d)?`, diff --git a/plugin/action/hash/normalize/token_normalizer_test.go b/plugin/action/hash/normalize/token_normalizer_test.go index 552e8ee8d..08a313eed 100644 --- a/plugin/action/hash/normalize/token_normalizer_test.go +++ b/plugin/action/hash/normalize/token_normalizer_test.go @@ -265,6 +265,8 @@ func TestTokenNormalizerBuiltin(t *testing.T) { "some 2025-01-13 20:58:04.019973588 +0000 UTC m=+1417512.275697914 here", "some 2025-01-13 20:58:04.019973588 -0700 MST m=-123.456789012 here", "some 2025-01-13 20:58:04.019973588 +0300 MSK m=+0.123456789 here", + "some 2025-01-13 20:58:04.019973588 -0700 MST here", + "some 2025-01-13 20:58:04.019973588 +0300 MSK here", "some 2025-01-13T10:20:40Z here", "some 2025-01-13T10:20:40.999999999Z here", "some 2025-01-13T10:20:40-06:00 here", @@ -470,9 +472,9 @@ func TestTokenNormalizerCustom(t *testing.T) { }, }, inputs: []string{ - `2006/01/02 15:04:05 error occurred, client: 10.125.172.251, upstream: "http://10.117.246.15:84/download", host: "mpm-youtube-downloader-38.name.com:84"`, + `2006/01/02 15:04:05 error occurred, client: 10.125.172.251, upstream: "http://10.117.246.15:84/download", host: "mpm-youtube-downloader-38.name.com:84", part/offset: 10117/2461584`, }, - want: " error occurred, client: , upstream: , host: ", + want: " error occurred, client: , upstream: , host: , part/offset: /", }, { name: "empty_patterns", @@ -515,19 +517,21 @@ func TestTokenNormalizerCustom(t *testing.T) { func genBenchInput(count int) []byte { var examples = []string{ - "s1mple falsehood", // no match - "test@host1.host2.com", // email - "http://some.host.com/page1?a=1", // url - "hello-world-123.COM", // host - "7c1811ed-e98f-4c9c-a9f9-58c757ff494f", // uuid - "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", // sha1 - "098f6bcd4621d373cade4e832627b4f6", // md5 - "2025-01-13T10:20:40Z", // datetime - "1.2.3.4", // ip - "-1.2m5s", // duration - "0x13eb85e69dfbc0758b12acdaae36287d", // hex - "-4.56", // float - "123", // int + "48757ec9f04efe7faacec8722f3476339b125a6b6172b8a69ff3aa329e0bd0ff", // hash(sha256) + "a94a8fe5ccb19ba61c4c0873d391e987982fbbd3", // hash(sha1) + "098f6bcd4621d373cade4e832627b4f6", // hash(md5) + "s1mple falsehood", // no match + "test@host1.host2.com", // email + "http://some.host.com/page1?a=1", // url + "hello-world-123.COM", // host + "7c1811ed-e98f-4c9c-a9f9-58c757ff494f", // uuid + "/home/user/photos", // filepath + "2025-01-13T10:20:40Z", // datetime + "1.2.3.4", // ip + "-1.2m5s", // duration + "0x13eb85e69dfbc0758b12acdaae36287d", // hex + "-4.56", // float + "123", // int "truE faLse", }