Skip to content

Commit 323284e

Browse files
committed
add simhash helper
1 parent 9576232 commit 323284e

File tree

1 file changed

+89
-0
lines changed

1 file changed

+89
-0
lines changed

bbot/core/helpers/simhash.py

Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
import xxhash
2+
import re
3+
4+
5+
class SimHash:
    """
    SimHash fingerprinting for near-duplicate text comparison.

    Text is broken into 3-character shingles, every shingle is hashed with
    xxHash, and each bit of the fingerprint is decided by a majority vote of
    the corresponding bit across all shingle hashes. Two fingerprints are then
    compared by counting the bits in which they differ (see `similarity`).
    """

    # Truncation settings used by _truncate_content. Units are whatever len()
    # and slicing measure on the input, i.e. characters for str.
    _TRUNCATE_THRESHOLD = 3072
    _TRUNCATE_HEAD = 2048
    _TRUNCATE_TAIL = 1024

    # Width of the character shingles used as features
    _SHINGLE_WIDTH = 3

    # Runs of non-word characters are dropped before shingling
    _NON_WORD_RE = re.compile(r"[^\w]+")

    def __init__(self, bits=64):
        """
        Args:
            bits (int): Number of bits in the generated fingerprints. Defaults to 64.

        Raises:
            ValueError: If bits is smaller than 1.
        """
        # A zero/negative width cannot produce a fingerprint and would make
        # similarity() divide by zero, so fail early with a clear message.
        if bits < 1:
            raise ValueError(f"bits must be a positive integer, got {bits!r}")
        self.bits = bits

    def _truncate_content(self, content):
        """
        Truncate large content for similarity comparison to improve performance.

        Truncation rules (lengths are measured with len(), i.e. characters for str):
        - If len(content) <= 3072: return as-is
        - If len(content) > 3072: return the first 2048 + the last 1024
        """
        # No truncation needed for smaller content
        if len(content) <= self._TRUNCATE_THRESHOLD:
            return content

        # Keep the head and the tail, drop the middle
        return content[: self._TRUNCATE_HEAD] + content[-self._TRUNCATE_TAIL :]

    def _normalize_text(self, text, normalization_filter):
        """
        Normalize text by removing every occurrence of the normalization filter from it.
        """
        return text.replace(normalization_filter, "")

    def _get_features(self, text):
        """
        Extract 3-character shingles as features.

        The text is lowercased and stripped of non-word characters first. Text
        shorter than the shingle width (including empty text) yields a single
        feature containing whatever characters remain.
        """
        width = self._SHINGLE_WIDTH
        text = self._NON_WORD_RE.sub("", text.lower())
        # max(..., 1) guarantees at least one feature for very short input
        return [text[i : i + width] for i in range(max(len(text) - width + 1, 1))]

    def _hash_feature(self, feature):
        """
        Return an integer hash of a feature using xxHash.

        The result carries at least `self.bits` bits of hash output.
        """
        data = feature.encode("utf-8")
        digest = xxhash.xxh64(data).intdigest()
        # xxh64 only produces 64 bits. For wider fingerprints, append digests
        # computed with successive seeds so the upper bits carry real signal
        # instead of always being zero. This loop does not run when bits <= 64,
        # so fingerprints for the default width are unchanged.
        for block in range(1, (self.bits + 63) // 64):
            digest |= xxhash.xxh64(data, seed=block).intdigest() << (64 * block)
        return digest

    def hash(self, text, truncate=True, normalization_filter=None):
        """
        Generate a SimHash fingerprint for the given text.

        Args:
            text (str): The text to hash
            truncate (bool): Whether to truncate large text for performance. Defaults to True.
                When enabled, text longer than 3072 characters is reduced to its first 2048 +
                last 1024 characters for comparison.
            normalization_filter (str): Optional substring that is removed from the text
                (after truncation) before hashing. Ignored when None or empty.

        Returns:
            int: The SimHash fingerprint
        """
        # Apply truncation if enabled
        if truncate:
            text = self._truncate_content(text)

        if normalization_filter:
            text = self._normalize_text(text, normalization_filter)

        # Each feature votes +1/-1 on every bit position
        vector = [0] * self.bits
        for feature in self._get_features(text):
            hv = self._hash_feature(feature)
            for i in range(self.bits):
                vector[i] += 1 if (hv >> i) & 1 else -1

        # Final fingerprint: a bit is set when its vote total is non-negative
        fingerprint = 0
        for i, val in enumerate(vector):
            if val >= 0:
                fingerprint |= 1 << i
        return fingerprint

    def similarity(self, hash1, hash2):
        """
        Compute similarity between two SimHashes as a value between 0.0 and 1.0.

        Args:
            hash1 (int): First fingerprint
            hash2 (int): Second fingerprint

        Returns:
            float: 1.0 minus the fraction of the `self.bits` bits that differ
        """
        # Hamming distance: count of differing bits.
        # bin().count("1") is used instead of int.bit_count() because the
        # latter only exists on Python 3.10+.
        diff = bin(hash1 ^ hash2).count("1")
        return 1.0 - (diff / self.bits)

0 commit comments

Comments
 (0)