diff --git a/InsightEngine/agent.py b/InsightEngine/agent.py
index e58fa57c0..3670dba5c 100644
--- a/InsightEngine/agent.py
+++ b/InsightEngine/agent.py
@@ -32,6 +32,7 @@
 )
 from .utils import format_search_results_for_prompt
 from .utils.config import Settings, settings
+from .utils.deduplication import build_result_dedup_key
 
 ENABLE_CLUSTERING: bool = True  # 是否启用聚类采样
 MAX_CLUSTERED_RESULTS: int = 50  # 聚类后最大返回结果数
@@ -378,8 +379,8 @@ def _deduplicate_results(self, results: List) -> List:
         unique_results = []
 
         for result in results:
-            # 使用URL或内容作为去重标识
-            identifier = result.url if result.url else result.title_or_content[:100]
+            # Use full content and source metadata for URL-less results.
+            identifier = build_result_dedup_key(result)
             if identifier not in seen:
                 seen.add(identifier)
                 unique_results.append(result)
diff --git a/InsightEngine/utils/deduplication.py b/InsightEngine/utils/deduplication.py
new file mode 100644
index 000000000..5a65c1e1d
--- /dev/null
+++ b/InsightEngine/utils/deduplication.py
@@ -0,0 +1,42 @@
+"""
+Search-result deduplication helpers.
+"""
+
+from typing import Any, Tuple
+
+
+def build_result_dedup_key(result: Any) -> Tuple[Any, ...]:
+    """
+    Build a stable deduplication key for a search result.
+
+    URL-backed results keep the historical URL-only behavior. Results without a
+    URL need the full content and source metadata so distinct comments or posts
+    are not collapsed just because they share a long prefix.
+
+    Args:
+        result: Search result object. Attributes are read with ``getattr``,
+            so missing ones fall back to ``None`` (``""`` for the content).
+
+    Returns:
+        ``("url", url)`` when ``url`` is truthy; otherwise a tuple tagged
+        ``"content"`` holding platform, content type, source table, author
+        nickname, publish time and the full ``title_or_content`` text.
+    """
+    url = getattr(result, "url", None)
+    if url:
+        return ("url", url)
+
+    publish_time = getattr(result, "publish_time", None)
+    # Datetime-like values are reduced to their ISO string form.
+    if publish_time is not None and hasattr(publish_time, "isoformat"):
+        publish_time = publish_time.isoformat()
+
+    return (
+        "content",
+        getattr(result, "platform", None),
+        getattr(result, "content_type", None),
+        getattr(result, "source_table", None),
+        getattr(result, "author_nickname", None),
+        publish_time,
+        getattr(result, "title_or_content", "") or "",
+    )
diff --git a/tests/test_insight_deduplication.py b/tests/test_insight_deduplication.py
new file mode 100644
index 000000000..6c0643f9a
--- /dev/null
+++ b/tests/test_insight_deduplication.py
@@ -0,0 +1,64 @@
+import importlib.util
+from pathlib import Path
+from types import SimpleNamespace
+
+
+def load_deduplication_module():
+    """Load ``deduplication.py`` by file path as ``insight_deduplication``."""
+    module_path = (
+        Path(__file__).resolve().parents[1]
+        / "InsightEngine"
+        / "utils"
+        / "deduplication.py"
+    )
+    spec = importlib.util.spec_from_file_location("insight_deduplication", module_path)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return module
+
+
+deduplication = load_deduplication_module()
+
+
+def test_url_less_results_do_not_dedupe_by_truncated_prefix():
+    """URL-less results sharing their first 100 characters get distinct keys."""
+    prefix = (
+        "Breaking: Major vulnerability found in popular framework "
+        "CVE-2026-1234 with remote code execution impact affecting millions "
+        "of users worldwide. "
+    )
+    first = SimpleNamespace(
+        url=None,
+        platform="weibo",
+        content_type="comment",
+        source_table="weibo_note_comment",
+        author_nickname="alice",
+        publish_time=None,
+        title_or_content=prefix + "No patch is needed.",
+    )
+    second = SimpleNamespace(
+        url=None,
+        platform="weibo",
+        content_type="comment",
+        source_table="weibo_note_comment",
+        author_nickname="alice",
+        publish_time=None,
+        title_or_content=prefix + "Patch immediately.",
+    )
+
+    assert first.title_or_content[:100] == second.title_or_content[:100]
+    assert deduplication.build_result_dedup_key(first) != deduplication.build_result_dedup_key(second)
+
+
+def test_url_results_keep_url_based_deduplication():
+    """Results with the same URL share one key even if their content differs."""
+    first = SimpleNamespace(
+        url="https://example.com/post/1",
+        title_or_content="Original content",
+    )
+    second = SimpleNamespace(
+        url="https://example.com/post/1",
+        title_or_content="Updated content",
+    )
+
+    assert deduplication.build_result_dedup_key(first) == deduplication.build_result_dedup_key(second)