Commit eb7f570
change to simhash for comparison
1 parent: df66c25

5 files changed: +211 -167 lines

bbot/core/helpers/helper.py

Lines changed: 8 additions & 0 deletions
@@ -15,6 +15,7 @@
 from .interactsh import Interactsh
 from .yara_helper import YaraHelper
 from .depsinstaller import DepsInstaller
+from .simhash import SimHash
 from .async_helpers import get_event_loop

 from bbot.scanner.target import BaseTarget
@@ -91,6 +92,7 @@ def __init__(self, preset):
         self._dns = None
         self._web = None
         self._asn = None
+        self._simhash = None
         self.config_aware_validators = self.validators.Validators(self)
         self.depsinstaller = DepsInstaller(self)
         self.word_cloud = WordCloud(self)
@@ -114,6 +116,12 @@ def asn(self):
         self._asn = ASNHelper(self)
         return self._asn

+    @property
+    def simhash(self):
+        if self._simhash is None:
+            self._simhash = SimHash()
+        return self._simhash
+
     @property
     def cloud(self):
         if self._cloud is None:
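The new lazy `simhash` property wires a `SimHash` helper into the shared helpers object. The helper module itself (`bbot/core/helpers/simhash.py`, presumably the fifth changed file) does not appear in this view, so the following is only a sketch of a plausible implementation, inferred from the call sites in the module diffs below: `hash()` takes an optional `normalization_filter`, fingerprints are 128-bit integers (the modules log them with `{simhash:0128b}`), and `similarity()` returns a score between 0.0 and 1.0. The whitespace tokenizer and the per-token hash function are assumptions.

```python
# Hypothetical sketch of the SimHash helper; bbot/core/helpers/simhash.py
# itself is not shown in this commit view. 128-bit fingerprints, whitespace
# tokenization, and bit-fraction similarity are assumptions inferred from
# the call sites in the modules below.
import xxhash


class SimHash:
    HASH_BITS = 128

    def hash(self, text, normalization_filter=None):
        """Reduce `text` to a 128-bit simhash fingerprint (an int)."""
        if normalization_filter:
            # Strip dynamic content (e.g. a reflected hostname) before
            # hashing, mirroring text_similarity()'s normalization_filter.
            text = text.replace(normalization_filter, "")
        weights = [0] * self.HASH_BITS
        for token in text.split():
            token_hash = xxhash.xxh3_128_intdigest(token.encode())
            for bit in range(self.HASH_BITS):
                weights[bit] += 1 if (token_hash >> bit) & 1 else -1
        fingerprint = 0
        for bit, weight in enumerate(weights):
            if weight > 0:
                fingerprint |= 1 << bit
        return fingerprint

    def similarity(self, hash1, hash2):
        """Similarity in [0.0, 1.0] as the fraction of matching bits."""
        hamming = bin(hash1 ^ hash2).count("1")
        return 1.0 - hamming / self.HASH_BITS
```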

bbot/core/helpers/web/web.py

Lines changed: 0 additions & 131 deletions
@@ -626,134 +626,3 @@ def response_to_json(self, response):
         }

         return j
-
-    def text_similarity(self, text1, text2, normalization_filter=None, similarity_cache=None, truncate=True):
-        """
-        Calculate similarity between two text strings using rapidfuzz with performance optimizations.
-
-        This method compares two text strings and returns a similarity score between 0.0 (completely
-        different) and 1.0 (identical). It includes several optimizations:
-        - Fast exact equality check for identical text
-        - Optional content truncation for large text (>4KB) to improve performance
-        - Optional caching using xxHash for fast cache key generation (bring your own similarity_cache dict)
-        - Text normalization filtering to remove dynamic content
-
-        The method is particularly useful for:
-        - Comparing HTTP response bodies
-        - Content change detection
-        - Wildcard detection in web applications
-        - Deduplication of similar text content
-
-        Args:
-            text1 (str): First text string to compare
-            text2 (str): Second text string to compare
-            normalization_filter (str, optional): String to remove from both texts before comparison.
-                Useful for removing hostnames, timestamps, or other dynamic content that would skew
-                similarity calculations.
-            similarity_cache (dict, optional): Cache dictionary for storing/retrieving similarity results.
-                Uses xxHash-based keys for fast lookups. If provided, results will be cached to improve
-                performance on repeated comparisons.
-            truncate (bool, optional): Whether to truncate large text for performance. Defaults to True.
-                When enabled, text larger than 4KB is truncated to first 2KB + last 1KB for comparison.
-
-        Returns:
-            float: Similarity score between 0.0 (completely different) and 1.0 (identical).
-                Values closer to 1.0 indicate more similar content.
-
-        Examples:
-            Basic similarity comparison:
-            >>> similarity = self.helpers.web.text_similarity(text1, text2)
-            >>> if similarity > 0.8:
-            >>>     print("Texts are very similar")
-
-            With content normalization filtering:
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     baseline_text,
-            >>>     probe_text,
-            >>>     normalization_filter="example.com"
-            >>> )
-
-            With caching for performance:
-            >>> cache = {}
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     text1,
-            >>>     text2,
-            >>>     similarity_cache=cache
-            >>> )
-
-            Disable truncation for exact comparison:
-            >>> similarity = self.helpers.web.text_similarity(
-            >>>     text1,
-            >>>     text2,
-            >>>     truncate=False
-            >>> )
-
-        Performance Notes:
-            - Text larger than 4KB is automatically truncated to first 2KB + last 1KB for comparison (when truncate=True)
-            - Exact equality is checked first for optimal performance on identical text
-            - Cache keys are order-independent (comparing A,B gives same cache key as B,A)
-            - Disabling truncation may impact performance on very large text but provides more accurate results
-        """
-
-        # Fastest check: exact equality (very common for identical content)
-        if text1 == text2:
-            return 1.0  # Exactly the same
-
-        from rapidfuzz import fuzz
-        import xxhash
-
-        # Normalize by removing specified content to eliminate differences
-        if normalization_filter:
-            text1 = text1.replace(normalization_filter, "")
-            text2 = text2.replace(normalization_filter, "")
-
-        # Create fast hashes for cache key using xxHash
-        text1_hash = xxhash.xxh64(text1.encode() if isinstance(text1, str) else text1).hexdigest()
-        text2_hash = xxhash.xxh64(text2.encode() if isinstance(text2, str) else text2).hexdigest()
-
-        # Create cache key (order-independent) - include truncate setting in cache key
-        cache_key = tuple(sorted([text1_hash, text2_hash]) + [str(truncate)])
-
-        # Check cache first if provided
-        if similarity_cache is not None and cache_key in similarity_cache:
-            return similarity_cache[cache_key]
-
-        # Calculate similarity with optional truncation for performance
-        if truncate and (len(text1) > 4096 or len(text2) > 4096):
-            # Take first 2048 bytes + last 1024 bytes for comparison
-            text1_truncated = self._truncate_content_for_similarity(text1)
-            text2_truncated = self._truncate_content_for_similarity(text2)
-            similarity = fuzz.ratio(text1_truncated, text2_truncated) / 100.0
-        else:
-            # Use full content for comparison
-            similarity = fuzz.ratio(text1, text2) / 100.0
-
-        # Cache the result if cache provided
-        if similarity_cache is not None:
-            similarity_cache[cache_key] = similarity
-
-        return similarity
-
-    def _truncate_content_for_similarity(self, content):
-        """
-        Truncate content for similarity comparison to improve performance.
-
-        Truncation rules:
-        - If content <= 3072 bytes (2048 + 1024): return as-is
-        - If content > 3072 bytes: return first 2048 bytes + last 1024 bytes
-
-        This captures:
-        - First 2048 bytes: HTTP headers, HTML head, title, main content start
-        - Last 1024 bytes: Footers, closing scripts, HTML closing tags
-        """
-        content_length = len(content)
-
-        # No truncation needed for smaller content
-        if content_length <= 3072:
-            return content
-
-        # Truncate: first 2048 + last 1024 bytes
-        first_part = content[:2048]
-        last_part = content[-1024:]
-
-        return first_part + last_part
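Everything deleted here existed to make pairwise fuzzy matching affordable: `fuzz.ratio` scales with the product of the input lengths in the worst case, hence the 2KB+1KB truncation and the xxHash-keyed result cache. A simhash fingerprint is computed once per body in a single linear pass, and every later comparison is one XOR plus a popcount, so none of that machinery carries over. A worked example of the new constant-time comparison, assuming the bit-fraction similarity sketched above:

```python
# Worked example of the replacement comparison: two fingerprints that
# differ in 13 of 128 bits. The fingerprint value is an arbitrary
# placeholder, not real output.
h1 = 0x0123_4567_89AB_CDEF_0123_4567_89AB_CDEF
h2 = h1 ^ ((1 << 13) - 1)  # flip the lowest 13 bits

hamming = bin(h1 ^ h2).count("1")  # 13
similarity = 1.0 - hamming / 128   # 1 - 13/128
print(f"{similarity:.3f}")         # 0.898, above the new 0.8 threshold
```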

bbot/modules/virtualhost.py

Lines changed: 18 additions & 24 deletions
@@ -12,10 +12,9 @@ class virtualhost(BaseModule):
     flags = ["active", "aggressive", "slow", "deadly"]
     meta = {"description": "Fuzz for virtual hosts", "created_date": "2022-05-02", "author": "@liquidsec"}

-    deps_pip = ["rapidfuzz"]
     deps_common = ["curl"]

-    SIMILARITY_THRESHOLD = 0.5
+    SIMILARITY_THRESHOLD = 0.8
     CANARY_LENGTH = 12
     MAX_RESULTS_FLOOD_PROTECTION = 50

@@ -416,12 +415,12 @@ async def _wildcard_canary_check(self, probe_scheme, probe_host, event, host_ip,
             )
             return True

+        probe_simhash = self.helpers.simhash.hash(probe_response["response_data"])
+        wildcard_simhash = self.helpers.simhash.hash(wildcard_canary_response["response_data"])
+        similarity = self.helpers.simhash.similarity(probe_simhash, wildcard_simhash)
+
         # Compare original probe response with modified response
-        similarity = self.helpers.web.text_similarity(
-            probe_response["response_data"],
-            wildcard_canary_response["response_data"],
-            similarity_cache=self.similarity_cache,
-        )
+
         result = similarity <= self.SIMILARITY_THRESHOLD

         if not result:
@@ -755,14 +754,12 @@ def analyze_response(self, probe_host, probe_response, canary_response, event):

         # Calculate content similarity to canary (junk response)
         # Use probe hostname for normalization to remove hostname reflection differences
-        similarity = self.helpers.web.text_similarity(
-            canary_response["response_data"],
-            probe_response["response_data"],
-            normalization_filter=probe_host,
-            similarity_cache=self.similarity_cache,
-        )

-        # Debug logging only when we think we found a match
+        probe_simhash = self.helpers.simhash.hash(probe_response["response_data"], normalization_filter=probe_host)
+        canary_simhash = self.helpers.simhash.hash(canary_response["response_data"], normalization_filter=probe_host)
+
+        similarity = self.helpers.simhash.similarity(probe_simhash, canary_simhash)
+
         if similarity <= self.SIMILARITY_THRESHOLD:
             self.verbose(
                 f"POTENTIAL MATCH: {probe_host} vs canary - similarity: {similarity:.3f} (threshold: {self.SIMILARITY_THRESHOLD}), probe status: {probe_status}, canary status: {canary_status}"
@@ -791,11 +788,10 @@ async def _verify_canary_keyword(self, original_response, probe_url, is_https, b
             )
             return False

-        similarity = self.helpers.web.text_similarity(
-            original_response["response_data"],
-            keyword_canary_response["response_data"],
-            similarity_cache=self.similarity_cache,
-        )
+        original_simhash = self.helpers.simhash.hash(original_response["response_data"])
+        keyword_simhash = self.helpers.simhash.hash(keyword_canary_response["response_data"])
+        similarity = self.helpers.simhash.similarity(original_simhash, keyword_simhash)
+
         if similarity >= self.SIMILARITY_THRESHOLD:
             self.verbose(
                 f"Intentionally wrong hostname has a canary too similar to the original. Using probe url: {probe_url} - similarity: {similarity:.3f} above threshold {self.SIMILARITY_THRESHOLD} - Original: {original_response.get('http_code', 'N/A')} ({len(original_response.get('response_data', ''))} bytes), Current: {keyword_canary_response.get('http_code', 'N/A')} ({len(keyword_canary_response.get('response_data', ''))} bytes)"
@@ -832,11 +828,9 @@ async def _verify_canary_consistency(
             return True

         # Fallback - use similarity comparison for response data (allows slight differences)
-        similarity = self.helpers.web.text_similarity(
-            original_canary_response["response_data"],
-            consistency_canary_response["response_data"],
-            similarity_cache=self.similarity_cache,
-        )
+        original_simhash = self.helpers.simhash.hash(original_canary_response["response_data"])
+        consistency_simhash = self.helpers.simhash.hash(consistency_canary_response["response_data"])
+        similarity = self.helpers.simhash.similarity(original_simhash, consistency_simhash)
         if similarity < self.SIMILARITY_THRESHOLD:
             self.verbose(
                 f"CANARY SIMILARITY CHANGED for {normalized_url} - similarity: {similarity:.3f} below threshold {self.SIMILARITY_THRESHOLD} - Original: {original_canary_response.get('http_code', 'N/A')} ({len(original_canary_response.get('response_data', ''))} bytes), Current: {consistency_canary_response.get('http_code', 'N/A')} ({len(consistency_canary_response.get('response_data', ''))} bytes)"

bbot/modules/waf_bypass.py

Lines changed: 26 additions & 12 deletions
@@ -142,8 +142,15 @@ async def handle_event(self, event):
             return

         # Store the response object for later comparison
-        self.content_fingerprints[url] = curl_response
-        self.debug(f"Stored response from {url} (content length: {len(curl_response['response_data'])})")
+        simhash = self.helpers.simhash.hash(curl_response["response_data"])
+        self.content_fingerprints[url] = {
+            "simhash": simhash,
+            "http_code": curl_response["http_code"],
+        }
+        self.critical(f"{simhash:0128b}")
+        self.debug(
+            f"Stored simhash of response from {url} (content length: {len(curl_response['response_data'])})"
+        )

         # Get CIDRs from the base domain of the protected domain
         base_dns = await self.helpers.dns.resolve(base_domain)
@@ -221,11 +228,10 @@ async def check_ip(self, ip, source_domain, protected_domain, source_event):
             self.debug(f"did not get original response for {matching_url}")
             return None

-        self.verbose(
-            f"Bypass attempt: {protected_domain} via {ip} (orig len {len(original_response['response_data'])}) from {source_domain}"
-        )
+        self.verbose(f"Bypass attempt: {protected_domain} via {ip} from {source_domain}")

         bypass_response = await self.get_url_content(matching_url, ip)
+        bypass_simhash = self.helpers.simhash.hash(bypass_response["response_data"])
         if not bypass_response:
             self.debug(f"Failed to get content through IP {ip} for URL {matching_url}")
             return None
@@ -238,19 +244,25 @@ async def check_ip(self, ip, source_domain, protected_domain, source_event):
         if bypass_response["http_code"] == 301 or bypass_response["http_code"] == 302:
             is_redirect = True

-        similarity = self.helpers.web.text_similarity(
-            original_response["response_data"],
-            bypass_response["response_data"],
-            similarity_cache=self.similarity_cache,
-        )
+        self.hugeinfo(f"{original_response['simhash']:0128b}")
+        self.hugeinfo(f"{bypass_simhash:0128b}")
+        similarity = self.helpers.simhash.similarity(original_response["simhash"], bypass_simhash)
+
+        self.critical(similarity)
+
+        # similarity = self.helpers.web.text_similarity(
+        #     original_response["response_data"],
+        #     bypass_response["response_data"],
+        #     similarity_cache=self.similarity_cache,
+        # )

         # For redirects, require exact match (1.0), otherwise use configured threshold
         required_threshold = 1.0 if is_redirect else self.similarity_threshold
         return (matching_url, ip, similarity, source_event) if similarity >= required_threshold else None

     async def finish(self):
-        self.debug(f"Found {len(self.protected_domains)} Protected Domains")
-        self.debug(f"Found {len(self.bypass_candidates)} Bypass Candidates")
+        self.critical(f"Found {len(self.protected_domains)} Protected Domains")
+        self.critical(f"Found {len(self.bypass_candidates)} Bypass Candidates")

         confirmed_bypasses = []  # [(protected_url, matching_ip, similarity)]
         all_ips = {}  # {ip: domain}
@@ -294,6 +306,8 @@ async def finish(self):
                         self.debug(
                             f"Added Neighbor IP ({ip} -> {n_ip_str}) as potential bypass IP derived from {domain}"
                         )
+                else:
+                    self.critical(f"IP {ip} is in CloudFlare IPS so we don't check as potential bypass")

         self.debug(f"\nFound {len(all_ips)} non-CloudFlare IPs to check: {all_ips}")
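The substantive storage change here: `content_fingerprints[url]` previously held the whole curl response, and now holds only a 128-bit fingerprint plus the status code, keeping memory per stored URL constant regardless of body size. A condensed sketch of the resulting comparison flow in `check_ip()` (the dict shape returned by `get_url_content()` is assumed; unlike the diff above, this sketch checks for a missing response before hashing it):

```python
# Condensed sketch of the fingerprint comparison in check_ip(); the
# response is checked for None before hashing it.
original = self.content_fingerprints[matching_url]  # {"simhash": int, "http_code": int}
bypass_response = await self.get_url_content(matching_url, ip)  # assumed shape
if bypass_response:
    bypass_simhash = self.helpers.simhash.hash(bypass_response["response_data"])
    similarity = self.helpers.simhash.similarity(original["simhash"], bypass_simhash)
```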
