@@ -626,134 +626,3 @@ def response_to_json(self, response):
         }

         return j
-
-    def text_similarity(self, text1, text2, normalization_filter=None, similarity_cache=None, truncate=True):
-        """
-        Calculate similarity between two text strings using rapidfuzz with performance optimizations.
-
-        This method compares two text strings and returns a similarity score between 0.0 (completely
-        different) and 1.0 (identical). It includes several optimizations:
-        - Fast exact equality check for identical text
-        - Optional content truncation for large text (>4KB) to improve performance
-        - Optional caching using xxHash for fast cache key generation (bring your own similarity_cache dict)
-        - Text normalization filtering to remove dynamic content
-
-        The method is particularly useful for:
-        - Comparing HTTP response bodies
-        - Content change detection
-        - Wildcard detection in web applications
-        - Deduplication of similar text content
-
-        Args:
-            text1 (str): First text string to compare
-            text2 (str): Second text string to compare
-            normalization_filter (str, optional): String to remove from both texts before comparison.
-                Useful for removing hostnames, timestamps, or other dynamic content that would skew
-                similarity calculations.
-            similarity_cache (dict, optional): Cache dictionary for storing/retrieving similarity results.
-                Uses xxHash-based keys for fast lookups. If provided, results will be cached to improve
-                performance on repeated comparisons.
-            truncate (bool, optional): Whether to truncate large text for performance. Defaults to True.
-                When enabled, text larger than 4KB is truncated to first 2KB + last 1KB for comparison.
-
-        Returns:
-            float: Similarity score between 0.0 (completely different) and 1.0 (identical).
-                Values closer to 1.0 indicate more similar content.
-
-        Examples:
-            Basic similarity comparison:
-            >>> similarity = self.helpers.web.text_similarity(text1, text2)
-            >>> if similarity > 0.8:
-            >>>     print("Texts are very similar")
-
-            With content normalization filtering:
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     baseline_text,
-            >>>     probe_text,
-            >>>     normalization_filter="example.com"
-            >>> )
-
-            With caching for performance:
-            >>> cache = {}
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     text1,
-            >>>     text2,
-            >>>     similarity_cache=cache
-            >>> )
-
-            Disable truncation for exact comparison:
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     text1,
-            >>>     text2,
-            >>>     truncate=False
-            >>> )
-
-        Performance Notes:
-            - Text larger than 4KB is automatically truncated to first 2KB + last 1KB for comparison (when truncate=True)
-            - Exact equality is checked first for optimal performance on identical text
-            - Cache keys are order-independent (comparing A,B gives the same cache key as B,A)
-            - Disabling truncation may impact performance on very large text but provides more accurate results
-        """
-
-        # Fastest check: exact equality (very common for identical content)
-        if text1 == text2:
-            return 1.0  # Exactly the same
-
-        from rapidfuzz import fuzz
-        import xxhash
-
-        # Normalize by removing specified content to eliminate differences
-        if normalization_filter:
-            text1 = text1.replace(normalization_filter, "")
-            text2 = text2.replace(normalization_filter, "")
-
-        # Create fast hashes for cache key using xxHash
-        text1_hash = xxhash.xxh64(text1.encode() if isinstance(text1, str) else text1).hexdigest()
-        text2_hash = xxhash.xxh64(text2.encode() if isinstance(text2, str) else text2).hexdigest()
-
-        # Create cache key (order-independent) - include truncate setting in cache key
-        cache_key = tuple(sorted([text1_hash, text2_hash]) + [str(truncate)])
-
-        # Check cache first if provided
-        if similarity_cache is not None and cache_key in similarity_cache:
-            return similarity_cache[cache_key]
-
-        # Calculate similarity with optional truncation for performance
-        if truncate and (len(text1) > 4096 or len(text2) > 4096):
-            # Take first 2048 bytes + last 1024 bytes for comparison
-            text1_truncated = self._truncate_content_for_similarity(text1)
-            text2_truncated = self._truncate_content_for_similarity(text2)
-            similarity = fuzz.ratio(text1_truncated, text2_truncated) / 100.0
-        else:
-            # Use full content for comparison
-            similarity = fuzz.ratio(text1, text2) / 100.0
-
-        # Cache the result if cache provided
-        if similarity_cache is not None:
-            similarity_cache[cache_key] = similarity
-
-        return similarity
-
-    def _truncate_content_for_similarity(self, content):
-        """
-        Truncate content for similarity comparison to improve performance.
-
-        Truncation rules:
-        - If content <= 3072 bytes (2048 + 1024): return as-is
-        - If content > 3072 bytes: return first 2048 bytes + last 1024 bytes
-
-        This captures:
-        - First 2048 bytes: HTTP headers, HTML head, title, main content start
-        - Last 1024 bytes: Footers, closing scripts, HTML closing tags
-        """
-        content_length = len(content)
-
-        # No truncation needed for smaller content
-        if content_length <= 3072:
-            return content
-
-        # Truncate: first 2048 + last 1024 bytes
-        first_part = content[:2048]
-        last_part = content[-1024:]
-
-        return first_part + last_part
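
For reference, the removed helper pair can be reproduced outside the class as a minimal standalone sketch. This is illustrative, not part of the remaining codebase: it assumes rapidfuzz and xxhash are installed, and the names text_similarity, truncate_for_similarity, and cache are chosen here for clarity.

from rapidfuzz import fuzz
import xxhash


def truncate_for_similarity(content, head=2048, tail=1024):
    # Short content passes through; longer content keeps its head and tail.
    if len(content) <= head + tail:
        return content
    return content[:head] + content[-tail:]


def text_similarity(text1, text2, normalization_filter=None, cache=None, truncate=True):
    # Cheapest check first: identical strings score 1.0 without any fuzzing.
    if text1 == text2:
        return 1.0

    # Strip dynamic content (hostnames, timestamps, etc.) before comparing.
    if normalization_filter:
        text1 = text1.replace(normalization_filter, "")
        text2 = text2.replace(normalization_filter, "")

    # Order-independent cache key: sorted xxHash digests plus the truncate flag.
    digests = sorted(xxhash.xxh64(t.encode()).hexdigest() for t in (text1, text2))
    cache_key = tuple(digests + [str(truncate)])
    if cache is not None and cache_key in cache:
        return cache[cache_key]

    # Truncate large inputs (first 2KB + last 1KB) to keep fuzz.ratio fast.
    if truncate and (len(text1) > 4096 or len(text2) > 4096):
        text1 = truncate_for_similarity(text1)
        text2 = truncate_for_similarity(text2)

    # fuzz.ratio returns 0-100; normalize to the 0.0-1.0 range.
    similarity = fuzz.ratio(text1, text2) / 100.0
    if cache is not None:
        cache[cache_key] = similarity
    return similarity

Usage mirrors the removed docstring examples: passing the same dict as cache across calls reuses earlier scores, and because the two digests are sorted before the key is built, comparing (A, B) and (B, A) hits the same cache entry.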