@@ -626,134 +626,3 @@ def response_to_json(self, response):
         }

         return j
-
-    def text_similarity(self, text1, text2, normalization_filter=None, similarity_cache=None, truncate=True):
-        """
-        Calculate similarity between two text strings using rapidfuzz with performance optimizations.
-
-        This method compares two text strings and returns a similarity score between 0.0 (completely
-        different) and 1.0 (identical). It includes several optimizations:
-        - Fast exact equality check for identical text
-        - Optional content truncation for large text (>4KB) to improve performance
-        - Optional caching using xxHash for fast cache key generation (bring your own similarity_cache dict)
-        - Text normalization filtering to remove dynamic content
-
-        The method is particularly useful for:
-        - Comparing HTTP response bodies
-        - Content change detection
-        - Wildcard detection in web applications
-        - Deduplication of similar text content
-
-        Args:
-            text1 (str): First text string to compare
-            text2 (str): Second text string to compare
-            normalization_filter (str, optional): String to remove from both texts before comparison.
-                Useful for removing hostnames, timestamps, or other dynamic content that would skew
-                similarity calculations.
-            similarity_cache (dict, optional): Cache dictionary for storing/retrieving similarity results.
-                Uses xxHash-based keys for fast lookups. If provided, results will be cached to improve
-                performance on repeated comparisons.
-            truncate (bool, optional): Whether to truncate large text for performance. Defaults to True.
-                When enabled, text larger than 4KB is truncated to first 2KB + last 1KB for comparison.
-
-        Returns:
-            float: Similarity score between 0.0 (completely different) and 1.0 (identical).
-                Values closer to 1.0 indicate more similar content.
-
-        Examples:
-            Basic similarity comparison:
-            >>> similarity = self.helpers.web.text_similarity(text1, text2)
-            >>> if similarity > 0.8:
-            >>>     print("Texts are very similar")
-
-            With content normalization filtering:
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     baseline_text,
-            >>>     probe_text,
-            >>>     normalization_filter="example.com"
-            >>> )
-
-            With caching for performance:
-            >>> cache = {}
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     text1,
-            >>>     text2,
-            >>>     similarity_cache=cache
-            >>> )
-
-            Disable truncation for exact comparison:
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     text1,
-            >>>     text2,
-            >>>     truncate=False
-            >>> )
-
-        Performance Notes:
-            - Text larger than 4KB is automatically truncated to first 2KB + last 1KB for comparison (when truncate=True)
-            - Exact equality is checked first for optimal performance on identical text
-            - Cache keys are order-independent (comparing A,B gives the same cache key as B,A)
-            - Disabling truncation may impact performance on very large text but provides more accurate results
-        """
-
-        # Fastest check: exact equality (very common for identical content)
-        if text1 == text2:
-            return 1.0  # Exactly the same
-
-        from rapidfuzz import fuzz
-        import xxhash
-
-        # Normalize by removing specified content to eliminate differences
-        if normalization_filter:
-            text1 = text1.replace(normalization_filter, "")
-            text2 = text2.replace(normalization_filter, "")
-
-        # Create fast hashes for cache key using xxHash
-        text1_hash = xxhash.xxh64(text1.encode() if isinstance(text1, str) else text1).hexdigest()
-        text2_hash = xxhash.xxh64(text2.encode() if isinstance(text2, str) else text2).hexdigest()
-
-        # Create cache key (order-independent) - include truncate setting in cache key
-        cache_key = tuple(sorted([text1_hash, text2_hash]) + [str(truncate)])
-
-        # Check cache first if provided
-        if similarity_cache is not None and cache_key in similarity_cache:
-            return similarity_cache[cache_key]
-
-        # Calculate similarity with optional truncation for performance
-        if truncate and (len(text1) > 4096 or len(text2) > 4096):
-            # Take first 2048 bytes + last 1024 bytes for comparison
-            text1_truncated = self._truncate_content_for_similarity(text1)
-            text2_truncated = self._truncate_content_for_similarity(text2)
-            similarity = fuzz.ratio(text1_truncated, text2_truncated) / 100.0
-        else:
-            # Use full content for comparison
-            similarity = fuzz.ratio(text1, text2) / 100.0
-
-        # Cache the result if cache provided
-        if similarity_cache is not None:
-            similarity_cache[cache_key] = similarity
-
-        return similarity
-
-    def _truncate_content_for_similarity(self, content):
-        """
-        Truncate content for similarity comparison to improve performance.
-
-        Truncation rules:
-        - If content <= 3072 bytes (2048 + 1024): return as-is
-        - If content > 3072 bytes: return first 2048 bytes + last 1024 bytes
-
-        This captures:
-        - First 2048 bytes: HTTP headers, HTML head, title, main content start
-        - Last 1024 bytes: Footers, closing scripts, HTML closing tags
-        """
-        content_length = len(content)
-
-        # No truncation needed for smaller content
-        if content_length <= 3072:
-            return content
-
-        # Truncate: first 2048 + last 1024 bytes
-        first_part = content[:2048]
-        last_part = content[-1024:]
-
-        return first_part + last_part
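
For reference, the removed helper pair can be reproduced outside the class as a minimal standalone sketch. This is illustrative, not part of the remaining codebase: it assumes rapidfuzz and xxhash are installed, and the names text_similarity, truncate_for_similarity, and cache are chosen here for clarity.

from rapidfuzz import fuzz
import xxhash


def truncate_for_similarity(content, head=2048, tail=1024):
    # Short content passes through; longer content keeps its head and tail.
    if len(content) <= head + tail:
        return content
    return content[:head] + content[-tail:]


def text_similarity(text1, text2, normalization_filter=None, cache=None, truncate=True):
    # Cheapest check first: identical strings score 1.0 without any fuzzing.
    if text1 == text2:
        return 1.0

    # Strip dynamic content (hostnames, timestamps, etc.) before comparing.
    if normalization_filter:
        text1 = text1.replace(normalization_filter, "")
        text2 = text2.replace(normalization_filter, "")

    # Order-independent cache key: sorted xxHash digests plus the truncate flag.
    digests = sorted(xxhash.xxh64(t.encode()).hexdigest() for t in (text1, text2))
    cache_key = tuple(digests + [str(truncate)])
    if cache is not None and cache_key in cache:
        return cache[cache_key]

    # Truncate large inputs (first 2KB + last 1KB) to keep fuzz.ratio fast.
    if truncate and (len(text1) > 4096 or len(text2) > 4096):
        text1 = truncate_for_similarity(text1)
        text2 = truncate_for_similarity(text2)

    # fuzz.ratio returns 0-100; normalize to the 0.0-1.0 range.
    similarity = fuzz.ratio(text1, text2) / 100.0
    if cache is not None:
        cache[cache_key] = similarity
    return similarity

Usage mirrors the removed docstring examples: passing the same dict as cache across calls reuses earlier scores, and because the two digests are sorted before the key is built, comparing (A, B) and (B, A) hits the same cache entry.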