Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions InsightEngine/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
)
from .utils import format_search_results_for_prompt
from .utils.config import Settings, settings
from .utils.deduplication import build_result_dedup_key

ENABLE_CLUSTERING: bool = True # Whether to enable clustering-based sampling
MAX_CLUSTERED_RESULTS: int = 50 # Maximum number of results returned after clustering
Expand Down Expand Up @@ -378,8 +379,8 @@ def _deduplicate_results(self, results: List) -> List:
unique_results = []

for result in results:
# 使用URL或内容作为去重标识
identifier = result.url if result.url else result.title_or_content[:100]
# Use full content and source metadata for URL-less results.
identifier = build_result_dedup_key(result)
if identifier not in seen:
seen.add(identifier)
unique_results.append(result)
Expand Down
32 changes: 32 additions & 0 deletions InsightEngine/utils/deduplication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""
Search-result deduplication helpers.
"""

from typing import Any, Tuple


def build_result_dedup_key(result: Any) -> Tuple[Any, ...]:
    """
    Compute the tuple used to decide whether two search results are duplicates.

    A result with a truthy ``url`` attribute is keyed as ``("url", url)``, so
    two results pointing at the same URL always compare equal regardless of
    their text.

    A result without a URL is keyed as ``("content", platform, content_type,
    source_table, author_nickname, publish_time, title_or_content)``. The
    complete text is used (no truncation), so entries that merely share a long
    common prefix still produce different keys. Attributes missing from
    ``result`` contribute ``None`` (or ``""`` for the text).
    """
    link = getattr(result, "url", None)
    if not link:
        # Objects exposing ``isoformat`` are replaced by that string form;
        # anything else (including ``None``) is kept unchanged.
        timestamp = getattr(result, "publish_time", None)
        if timestamp is not None and hasattr(timestamp, "isoformat"):
            timestamp = timestamp.isoformat()

        # Source metadata, read in the same order it appears in the key.
        origin = tuple(
            getattr(result, field_name, None)
            for field_name in (
                "platform",
                "content_type",
                "source_table",
                "author_nickname",
            )
        )

        # A missing, ``None`` or otherwise falsy text collapses to "".
        text = getattr(result, "title_or_content", "")
        if not text:
            text = ""

        return ("content", *origin, timestamp, text)

    return ("url", link)
61 changes: 61 additions & 0 deletions tests/test_insight_deduplication.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import importlib.util
from pathlib import Path
from types import SimpleNamespace


def load_deduplication_module():
    """
    Load ``InsightEngine/utils/deduplication.py`` straight from its file path.

    The path is resolved relative to the directory one level above this
    file's own directory. The file is executed as a standalone module named
    ``insight_deduplication`` and the resulting module object is returned.
    """
    base_dir = Path(__file__).resolve().parents[1]
    source_file = base_dir / "InsightEngine" / "utils" / "deduplication.py"

    module_spec = importlib.util.spec_from_file_location(
        "insight_deduplication", source_file
    )
    loaded = importlib.util.module_from_spec(module_spec)
    # Run the module body so its functions are defined on ``loaded``.
    module_spec.loader.exec_module(loaded)
    return loaded


# Load the helper module once at import time; the tests below call it through this name.
deduplication = load_deduplication_module()


def test_url_less_results_do_not_dedupe_by_truncated_prefix():
    """URL-less results that differ only after their first 100 characters get distinct keys."""
    shared_lead = (
        "Breaking: Major vulnerability found in popular framework "
        "CVE-2026-1234 with remote code execution impact affecting millions "
        "of users worldwide. "
    )
    # Everything except the text is identical between the two results.
    common_fields = dict(
        url=None,
        platform="weibo",
        content_type="comment",
        source_table="weibo_note_comment",
        author_nickname="alice",
        publish_time=None,
    )
    no_patch = SimpleNamespace(
        **common_fields,
        title_or_content=shared_lead + "No patch is needed.",
    )
    patch_now = SimpleNamespace(
        **common_fields,
        title_or_content=shared_lead + "Patch immediately.",
    )

    # Precondition: a 100-character prefix cannot tell the two texts apart.
    assert no_patch.title_or_content[:100] == patch_now.title_or_content[:100]

    no_patch_key = deduplication.build_result_dedup_key(no_patch)
    patch_now_key = deduplication.build_result_dedup_key(patch_now)
    assert no_patch_key != patch_now_key


def test_url_results_keep_url_based_deduplication():
    """Results sharing a URL produce the same key even when their text differs."""
    shared_url = "https://example.com/post/1"
    original = SimpleNamespace(url=shared_url, title_or_content="Original content")
    updated = SimpleNamespace(url=shared_url, title_or_content="Updated content")

    original_key = deduplication.build_result_dedup_key(original)
    updated_key = deduplication.build_result_dedup_key(updated)
    assert original_key == updated_key