diff --git a/AGENTS.md b/AGENTS.md
index 981fc974..dcc3b9a4 100644
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -11,28 +11,10 @@ that make changes to the repository. Exception: Worktrees/Branches below.
 When moving, copying or deleting files, use the git commands:
 `git mv`, `git cp`, `git rm`
 
-## Worktrees and Branches
-
-- Each session uses its own worktree with a feature branch
-- Create worktrees with: `git worktree add ../<repo>-<branch> -b <branch>`
-- Push the branch to the `me` remote: `git push me <branch>`
-- Set upstream to `me/<branch>`: `git branch --set-upstream-to me/<branch>`
-- **Never** upstream to `me/main` — that must stay identical to `origin/main`
-- The worktree directory name should be `<repo>-<branch>` (sibling of the main checkout)
-- **Work in the worktree directory**, not the main checkout — edit files there, run tests there
-- VS Code may show buffers from the main checkout; ignore those when working in a worktree.
-  When in doubt, verify edits landed on disk with `cat` or `grep` in the terminal.
-
-## Debugging discipline
-
-- When a bug seems impossible, suspect stale files or wrong working directory — not exotic causes.
-- If you're tempted to blame installed package versions, `__pycache__`, or similar,
-  **stop and ask the user** before investigating further. You're probably on the wrong track.
-
-**Whenever the user tells you how to do something, states a preference, or corrects you,
-extract a general rule and add it to AGENTS.md** (unless it's already covered -- maybe
-reformulate since it apparently didn't work). This applies even without being asked.
-In all cases show what you added to AGENTS.md.
+When I ask you to update AGENTS.md (even tentatively), extract a general rule from
+what I said and add it to AGENTS.md (unless it's already there -- then reformulate it,
+since it apparently didn't work). Also, when it looks like I state a general rule, add
+it to AGENTS.md. In all cases show what you added to AGENTS.md.
 
 - Don't use '!' on the command line, it's some bash magic (even inside single quotes)
 - When running 'make' commands, do not use the venv (the Makefile uses 'uv run')
diff --git a/pyproject.toml b/pyproject.toml
index 5a1367e9..328ff6f0 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -64,6 +64,7 @@ asyncio_default_fixture_loop_scope = "function"
 testpaths = ["tests"]
 
 [tool.pyright]
+extraPaths = ["src", "tools"]
 reportUnusedVariable = true
 reportUnusedImport = true
 reportDuplicateImport = true
diff --git a/src/typeagent/aitools/vectorbase.py b/src/typeagent/aitools/vectorbase.py
index f67812e6..7a3c4549 100644
--- a/src/typeagent/aitools/vectorbase.py
+++ b/src/typeagent/aitools/vectorbase.py
@@ -13,15 +13,52 @@
 )
 from .model_adapters import create_embedding_model
 
+DEFAULT_MIN_SCORE = 0.85
+
+# Empirical defaults for built-in OpenAI embedding models.
+# These values come from the Adrian Tchaikovsky Episode 53
+# search benchmark in `tools/repeat_embedding_benchmarks.py`, using an
+# exhaustive 0.00..1.00 min_score sweep. The benchmark recomputes corpus and
+# query embeddings for each model and ignores the fixture's serialized
+# embedding sidecar. Scores are normalized from cosine similarity to the public
+# 0..1 min_score scale.
+# These are repository defaults for known models, not universal truths.
+# Unknown models keep the long-standing fallback score of 0.85. Callers can
+# always override `min_score` explicitly for their own use cases or models. We
+# intentionally leave `max_matches` out of this table: the benchmark still
+# reports a best `max_hits` row, but the library default remains `None` unless
+# a caller opts into a specific limit.
+MODEL_DEFAULT_MIN_SCORES: dict[str, float] = {
+    "text-embedding-3-large": 0.74,
+    "text-embedding-3-small": 0.73,
+    "text-embedding-ada-002": 0.93,
+}
+
+
+def get_default_min_score(model_name: str) -> float:
+    """Return the repository default score cutoff for a known model name."""
+
+    return MODEL_DEFAULT_MIN_SCORES.get(model_name, DEFAULT_MIN_SCORE)
+
+
+def cosine_to_score(cosine_similarity: np.ndarray) -> np.ndarray:
+    """Map cosine similarity from -1..1 to the public 0..1 score scale."""
+
+    return np.clip((cosine_similarity + 1.0) / 2.0, 0.0, 1.0)
+
 
 @dataclass
 class ScoredInt:
+    """Associate an integer ordinal with its similarity score."""
+
     item: int
     score: float
 
 
 @dataclass
 class TextEmbeddingIndexSettings:
+    """Runtime settings for embedding-backed fuzzy lookup."""
+
     embedding_model: IEmbeddingModel
     min_score: float  # Between 0.0 and 1.0
     max_matches: int | None  # >= 1; None means no limit
@@ -34,10 +71,12 @@ def __init__(
         max_matches: int | None = None,
         batch_size: int | None = None,
     ):
-        self.min_score = min_score if min_score is not None else 0.85
+        self.embedding_model = embedding_model or create_embedding_model()
+        model_name = getattr(self.embedding_model, "model_name", "")
+        default_min_score = get_default_min_score(model_name)
+        self.min_score = min_score if min_score is not None else default_min_score
         self.max_matches = max_matches if max_matches and max_matches >= 1 else None
         self.batch_size = batch_size if batch_size and batch_size >= 1 else 8
-        self.embedding_model = embedding_model or create_embedding_model()
 
 
 class VectorBase:
@@ -76,20 +115,19 @@ def __bool__(self) -> bool:
     def add_embedding(
         self, key: str | None, embedding: NormalizedEmbedding | list[float]
     ) -> None:
-        if isinstance(embedding, list):
-            embedding = np.array(embedding, dtype=np.float32)
+        embedding_array = np.asarray(embedding, dtype=np.float32)
         if self._embedding_size == 0:
-            self._set_embedding_size(len(embedding))
+            self._set_embedding_size(len(embedding_array))
             self._vectors.shape = (0, self._embedding_size)
-        if len(embedding) != self._embedding_size:
+        if len(embedding_array) != self._embedding_size:
             raise ValueError(
                 f"Embedding size mismatch: expected {self._embedding_size}, "
-                f"got {len(embedding)}"
+                f"got {len(embedding_array)}"
             )
-        embeddings = embedding.reshape(1, -1)  # Make it 2D: 1xN
+        embeddings = embedding_array.reshape(1, -1)  # Make it 2D: 1xN
         self._vectors = np.append(self._vectors, embeddings, axis=0)
         if key is not None:
-            self._model.add_embedding(key, embedding)
+            self._model.add_embedding(key, embedding_array)
 
     def add_embeddings(
         self, keys: None | list[str], embeddings: NormalizedEmbeddings
@@ -135,7 +173,7 @@ def fuzzy_lookup_embedding(
             min_score = 0.0
         if len(self._vectors) == 0:
             return []
-        scores = np.dot(self._vectors, embedding)
+        scores = cosine_to_score(np.dot(self._vectors, embedding))
         if predicate is None:
             # Stay in numpy: filter by score, then top-k via argpartition.
             indices = np.flatnonzero(scores >= min_score)
             if len(indices) == 0:
                 return []
@@ -177,7 +215,7 @@ def fuzzy_lookup_embedding_in_subset(
             return []
         # Compute dot products only for the subset instead of all vectors.
         subset = np.asarray(ordinals_of_subset)
-        scores = np.dot(self._vectors[subset], embedding)
+        scores = cosine_to_score(np.dot(self._vectors[subset], embedding))
         indices = np.flatnonzero(scores >= min_score)
         if len(indices) == 0:
             return []
@@ -238,7 +276,7 @@ def deserialize(self, data: NormalizedEmbeddings | None) -> None:
             return
         if self._embedding_size == 0:
             if data.ndim < 2 or data.shape[0] == 0:
-                # Empty data — can't determine size; just clear.
+                # Empty data: can't determine size; just clear.
                 self.clear()
                 return
             self._set_embedding_size(data.shape[1])
diff --git a/src/typeagent/knowpro/convsettings.py b/src/typeagent/knowpro/convsettings.py
index acf559fe..bd05c19e 100644
--- a/src/typeagent/knowpro/convsettings.py
+++ b/src/typeagent/knowpro/convsettings.py
@@ -12,6 +12,9 @@
 from ..aitools.vectorbase import TextEmbeddingIndexSettings
 from .interfaces import IKnowledgeExtractor, IStorageProvider
 
+DEFAULT_RELATED_TERM_MIN_SCORE = 0.85
+DEFAULT_MESSAGE_TEXT_MIN_SCORE = 0.7
+
 
 @dataclass
 class MessageTextIndexSettings:
@@ -54,13 +57,13 @@ def __init__(
         # All settings share the same model, so they share the embedding cache.
         model = model or create_embedding_model(retrier=embed_retrier)
         self.embedding_model = model
-        min_score = 0.85
+        min_score = DEFAULT_RELATED_TERM_MIN_SCORE
         self.related_term_index_settings = RelatedTermIndexSettings(
             TextEmbeddingIndexSettings(model, min_score=min_score, max_matches=50)
         )
         self.thread_settings = TextEmbeddingIndexSettings(model, min_score=min_score)
         self.message_text_index_settings = MessageTextIndexSettings(
-            TextEmbeddingIndexSettings(model, min_score=0.7)
+            TextEmbeddingIndexSettings(model, min_score=DEFAULT_MESSAGE_TEXT_MIN_SCORE)
         )
         self.semantic_ref_index_settings = SemanticRefIndexSettings(
             concurrency=4,
diff --git a/tests/test_benchmark_embeddings.py b/tests/test_benchmark_embeddings.py
new file mode 100644
index 00000000..5b6f6815
--- /dev/null
+++ b/tests/test_benchmark_embeddings.py
@@ -0,0 +1,417 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
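
For reviewers, the scoring change in vectorbase.py above condenses into a standalone sketch. The names are copied from the diff; the assertions are illustrative and use only exactly-representable floats:

    import numpy as np

    DEFAULT_MIN_SCORE = 0.85
    MODEL_DEFAULT_MIN_SCORES = {
        "text-embedding-3-large": 0.74,
        "text-embedding-3-small": 0.73,
        "text-embedding-ada-002": 0.93,
    }

    def get_default_min_score(model_name: str) -> float:
        # Known models get benchmarked defaults; unknown models keep 0.85.
        return MODEL_DEFAULT_MIN_SCORES.get(model_name, DEFAULT_MIN_SCORE)

    def cosine_to_score(cosine_similarity: np.ndarray) -> np.ndarray:
        # Map -1..1 cosine similarity onto the public 0..1 min_score scale,
        # so an old cosine-scale cutoff c now corresponds to (c + 1) / 2.
        return np.clip((cosine_similarity + 1.0) / 2.0, 0.0, 1.0)

    assert cosine_to_score(np.array([-1.0, 0.0, 1.0])).tolist() == [0.0, 0.5, 1.0]
    assert get_default_min_score("some-unknown-model") == 0.85

On this scale, the benchmarked 0.74 default for text-embedding-3-large corresponds to a raw cosine similarity of 0.48.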
+ +from importlib.util import module_from_spec, spec_from_file_location +import json +from pathlib import Path + +import numpy as np +import pytest + +from typeagent.aitools.embeddings import NormalizedEmbedding, NormalizedEmbeddings + +MODULE_PATH = ( + Path(__file__).resolve().parent.parent / "tools" / "benchmark_embeddings.py" +) +SPEC = spec_from_file_location("benchmark_embeddings_for_test", MODULE_PATH) +assert SPEC is not None +assert SPEC.loader is not None +BENCHMARK_EMBEDDINGS = module_from_spec(SPEC) +SPEC.loader.exec_module(BENCHMARK_EMBEDDINGS) + +BenchmarkRow = BENCHMARK_EMBEDDINGS.BenchmarkRow +AnswerMetrics = BENCHMARK_EMBEDDINGS.AnswerMetrics +PipelineMetrics = BENCHMARK_EMBEDDINGS.PipelineMetrics +RelatedTermMetrics = BENCHMARK_EMBEDDINGS.RelatedTermMetrics +RelatedTermQueryCase = BENCHMARK_EMBEDDINGS.RelatedTermQueryCase +SearchMetrics = BENCHMARK_EMBEDDINGS.SearchMetrics +build_float_range = BENCHMARK_EMBEDDINGS.build_float_range +create_benchmark_conversation_settings = ( + BENCHMARK_EMBEDDINGS.create_benchmark_conversation_settings +) +evaluate_answer_queries = BENCHMARK_EMBEDDINGS.evaluate_answer_queries +evaluate_related_term_queries = BENCHMARK_EMBEDDINGS.evaluate_related_term_queries +load_corpus_metadata = BENCHMARK_EMBEDDINGS.load_corpus_metadata +load_pipeline_queries = BENCHMARK_EMBEDDINGS.load_pipeline_queries +load_message_texts = BENCHMARK_EMBEDDINGS.load_message_texts +load_related_term_queries = BENCHMARK_EMBEDDINGS.load_related_term_queries +load_related_term_texts = BENCHMARK_EMBEDDINGS.load_related_term_texts +parse_float_list = BENCHMARK_EMBEDDINGS.parse_float_list +resolve_min_scores = BENCHMARK_EMBEDDINGS.resolve_min_scores +select_best_row = BENCHMARK_EMBEDDINGS.select_best_row + + +class FakeEmbeddingModel: + """Minimal embedding model stub for settings tests.""" + + def __init__(self, model_name: str) -> None: + self.model_name = model_name + + def add_embedding(self, key: str, embedding: NormalizedEmbedding) -> None: + del key, embedding + + async def get_embedding_nocache(self, input: str) -> NormalizedEmbedding: + del input + return np.array([1.0], dtype=np.float32) + + async def get_embeddings_nocache(self, input: list[str]) -> NormalizedEmbeddings: + del input + return np.array([[1.0]], dtype=np.float32) + + async def get_embedding(self, key: str) -> NormalizedEmbedding: + del key + return np.array([1.0], dtype=np.float32) + + async def get_embeddings(self, keys: list[str]) -> NormalizedEmbeddings: + del keys + return np.array([[1.0]], dtype=np.float32) + + +def make_row( + min_score: float, + max_hits: int, + hit_rate: float, + mean_reciprocal_rank: float, + semantic_score: float | None = None, + pipeline_hit_rate: float | None = None, + pipeline_mean_reciprocal_rank: float | None = None, + related_hit_rate: float | None = None, + related_mean_reciprocal_rank: float | None = None, +) -> BenchmarkRow: + """Build a benchmark row without repeating nested metrics boilerplate.""" + + answer_metrics = ( + AnswerMetrics( + answerable_support=semantic_score / 100, + no_answer_rejection_rate=0.0, + semantic_score=semantic_score, + ) + if semantic_score is not None + else None + ) + related_metrics = ( + RelatedTermMetrics( + hit_rate=related_hit_rate, + mean_reciprocal_rank=related_mean_reciprocal_rank, + mean_result_count=10.0, + ) + if related_hit_rate is not None and related_mean_reciprocal_rank is not None + else None + ) + pipeline_metrics = ( + PipelineMetrics( + hit_rate=pipeline_hit_rate, + mean_reciprocal_rank=pipeline_mean_reciprocal_rank, 
+ mean_result_count=10.0, + ) + if pipeline_hit_rate is not None and pipeline_mean_reciprocal_rank is not None + else None + ) + return BenchmarkRow( + min_score=min_score, + max_hits=max_hits, + metrics=SearchMetrics( + hit_rate=hit_rate, + mean_reciprocal_rank=mean_reciprocal_rank, + ), + pipeline_metrics=pipeline_metrics, + related_metrics=related_metrics, + answer_metrics=answer_metrics, + ) + + +@pytest.mark.parametrize( + ("model_name", "expected_min_score"), + [ + ("text-embedding-3-large", 0.74), + ("text-embedding-3-small", 0.73), + ("text-embedding-ada-002", 0.93), + ], +) +def test_benchmark_conversation_settings_use_model_default( + model_name: str, + expected_min_score: float, +) -> None: + settings = create_benchmark_conversation_settings(FakeEmbeddingModel(model_name)) + + assert ( + settings.related_term_index_settings.embedding_index_settings.min_score + == expected_min_score + ) + assert settings.thread_settings.min_score == expected_min_score + assert ( + settings.message_text_index_settings.embedding_index_settings.min_score + == expected_min_score + ) + assert ( + settings.related_term_index_settings.embedding_index_settings.max_matches == 50 + ) + + +def test_select_best_row_prefers_higher_min_score_on_metric_tie() -> None: + rows = [ + make_row(0.25, 15, 98.5, 0.7514, semantic_score=90.0), + make_row(0.70, 15, 98.5, 0.7514, semantic_score=80.0), + ] + + best_row = select_best_row(rows) + + assert best_row.min_score == 0.70 + assert best_row.max_hits == 15 + + +def test_select_best_row_only_uses_answer_context_after_min_score_tie() -> None: + rows = [ + make_row(0.70, 15, 98.5, 0.7514, semantic_score=80.0), + make_row(0.70, 15, 98.5, 0.7514, semantic_score=90.0), + ] + + best_row = select_best_row(rows) + + assert best_row.answer_metrics is not None + assert best_row.answer_metrics.semantic_score == 90.0 + + +def test_select_best_row_prefers_related_term_quality_before_message_quality() -> None: + rows = [ + make_row( + 0.80, + 15, + 98.5, + 0.90, + related_hit_rate=90.0, + related_mean_reciprocal_rank=0.70, + ), + make_row( + 0.70, + 15, + 98.5, + 0.80, + related_hit_rate=95.0, + related_mean_reciprocal_rank=0.75, + ), + ] + + best_row = select_best_row(rows) + + assert best_row.min_score == 0.70 + + +def test_select_best_row_prefers_pipeline_quality_before_related_term_quality() -> None: + rows = [ + make_row( + 0.80, + 15, + 98.5, + 0.90, + pipeline_hit_rate=90.0, + pipeline_mean_reciprocal_rank=0.70, + related_hit_rate=99.0, + related_mean_reciprocal_rank=0.99, + ), + make_row( + 0.70, + 15, + 98.5, + 0.80, + pipeline_hit_rate=95.0, + pipeline_mean_reciprocal_rank=0.75, + related_hit_rate=90.0, + related_mean_reciprocal_rank=0.70, + ), + ] + + best_row = select_best_row(rows) + + assert best_row.min_score == 0.70 + + +def test_evaluate_related_term_queries_scores_expected_terms() -> None: + vector_base = BENCHMARK_EMBEDDINGS.VectorBase( + BENCHMARK_EMBEDDINGS.TextEmbeddingIndexSettings( + BENCHMARK_EMBEDDINGS.create_embedding_model("test") + ) + ) + vector_base.add_embedding(None, np.array([1.0, 0.0], dtype=np.float32)) + vector_base.add_embedding(None, np.array([0.0, 1.0], dtype=np.float32)) + + metrics = evaluate_related_term_queries( + vector_base, + ["alpha", "beta"], + [RelatedTermQueryCase("query", ["beta"])], + np.array([[0.0, 1.0]], dtype=np.float32), + min_score=0.0, + max_hits=2, + ) + + assert metrics.hit_rate == 100.0 + assert metrics.mean_reciprocal_rank == 1.0 + assert metrics.mean_result_count == 2.0 + + +def 
test_evaluate_answer_queries_reports_normalized_support_score() -> None: + vector_base = BENCHMARK_EMBEDDINGS.VectorBase( + BENCHMARK_EMBEDDINGS.TextEmbeddingIndexSettings( + BENCHMARK_EMBEDDINGS.create_embedding_model("test") + ) + ) + vector_base.add_embedding(None, np.array([1.0, 0.0], dtype=np.float32)) + answer_cases = [ + BENCHMARK_EMBEDDINGS.AnswerQueryCase( + question="question", + answer="answer", + has_no_answer=False, + ) + ] + + metrics = evaluate_answer_queries( + vector_base, + answer_cases, + np.array([[0.0, 1.0]], dtype=np.float32), + np.array([[0.0, 1.0]], dtype=np.float32), + min_score=0.0, + max_hits=1, + ) + + assert metrics.answerable_support == 0.5 + assert metrics.semantic_score == 75.0 + + +def test_select_best_row_prefers_lower_max_hits_on_full_tie() -> None: + rows = [ + make_row(0.70, 20, 98.5, 0.7514), + make_row(0.70, 15, 98.5, 0.7514), + ] + + best_row = select_best_row(rows) + + assert best_row.min_score == 0.70 + assert best_row.max_hits == 15 + + +def test_parse_float_list_defaults_to_tenth_point_grid() -> None: + assert parse_float_list(None) == [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9] + + +def test_build_float_range_supports_hundredth_point_sweeps() -> None: + assert build_float_range(0.01, 0.05, 0.01) == [0.01, 0.02, 0.03, 0.04, 0.05] + + +def test_resolve_min_scores_uses_generated_range() -> None: + assert resolve_min_scores(None, 0.01, 0.03, 0.01) == [0.01, 0.02, 0.03] + + +def test_resolve_min_scores_rejects_mixed_inputs() -> None: + with pytest.raises(ValueError, match="Use either --min-scores"): + resolve_min_scores("0.1,0.2", 0.01, 0.03, 0.01) + + +def test_load_message_texts_returns_one_text_blob_per_message() -> None: + repo_root = Path(__file__).resolve().parent.parent + + message_texts = load_message_texts(repo_root) + + assert message_texts + assert all(isinstance(text, str) for text in message_texts) + + +def test_load_related_term_texts_returns_fixture_terms() -> None: + repo_root = Path(__file__).resolve().parent.parent + + terms = load_related_term_texts(repo_root) + + assert len(terms) == 1188 + assert "adrian tchaikovsky" in terms + + +def test_load_related_term_queries_returns_compiled_related_terms() -> None: + repo_root = Path(__file__).resolve().parent.parent + + cases = load_related_term_queries(repo_root) + + assert cases + assert all(case.expected_related_terms for case in cases) + + +def test_load_pipeline_queries_strips_cached_related_terms() -> None: + repo_root = Path(__file__).resolve().parent.parent + + cases = load_pipeline_queries(repo_root) + + assert cases + for case in cases: + for obj in BENCHMARK_EMBEDDINGS.iter_dicts( + case.query_exprs[0].__pydantic_serializer__.to_python( + case.query_exprs[0], + by_alias=True, + ) + ): + assert obj.get("relatedTerms") in (None, []) + + +def test_load_message_texts_ignores_serialized_embedding_sidecar( + tmp_path: Path, +) -> None: + testdata_dir = tmp_path / "tests" / "testdata" + testdata_dir.mkdir(parents=True) + (testdata_dir / "Episode_53_AdrianTchaikovsky_index_data.json").write_text( + json.dumps( + { + "messages": [ + {"textChunks": ["hello", "world"]}, + {"textChunks": ["goodbye"]}, + ], + "embeddingFileHeader": { + "messageCount": 2, + "relatedCount": 0, + "modelMetadata": {"embeddingSize": 1536}, + }, + } + ), + encoding="utf-8", + ) + (testdata_dir / "Episode_53_AdrianTchaikovsky_index_embeddings.bin").write_bytes( + b"not real embeddings" + ) + + message_texts = load_message_texts(tmp_path) + + assert message_texts == ["hello world", "goodbye"] + + +def 
test_load_corpus_metadata_reports_serialized_sidecar_details() -> None: + repo_root = Path(__file__).resolve().parent.parent + + metadata = load_corpus_metadata(repo_root) + + assert metadata.message_count > 0 + assert metadata.serialized_embedding_size == 1536 + assert metadata.serialized_message_count == 106 + assert metadata.serialized_related_count == 1188 + assert metadata.serialized_total_embedding_count == 1294 + + +def test_load_corpus_metadata_rejects_inconsistent_sidecar_size( + tmp_path: Path, +) -> None: + testdata_dir = tmp_path / "tests" / "testdata" + testdata_dir.mkdir(parents=True) + (testdata_dir / "Episode_53_AdrianTchaikovsky_index_data.json").write_text( + json.dumps( + { + "messages": [{"textChunks": ["hello"]}], + "embeddingFileHeader": { + "messageCount": 1, + "relatedCount": 1, + "modelMetadata": {"embeddingSize": 2}, + }, + } + ), + encoding="utf-8", + ) + (testdata_dir / "Episode_53_AdrianTchaikovsky_index_embeddings.bin").write_bytes( + b"bad-sidecar" + ) + + with pytest.raises(ValueError, match="Serialized benchmark sidecar size"): + load_corpus_metadata(tmp_path) diff --git a/tests/test_convsettings.py b/tests/test_convsettings.py new file mode 100644 index 00000000..4063583c --- /dev/null +++ b/tests/test_convsettings.py @@ -0,0 +1,58 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +import numpy as np + +from typeagent.aitools.embeddings import NormalizedEmbedding, NormalizedEmbeddings +from typeagent.knowpro.convsettings import ( + ConversationSettings, + DEFAULT_MESSAGE_TEXT_MIN_SCORE, + DEFAULT_RELATED_TERM_MIN_SCORE, +) + + +class FakeEmbeddingModel: + """Minimal embedding model stub for settings tests.""" + + def __init__(self, model_name: str) -> None: + self.model_name = model_name + + def add_embedding(self, key: str, embedding: NormalizedEmbedding) -> None: + del key, embedding + + async def get_embedding_nocache(self, input: str) -> NormalizedEmbedding: + del input + return np.array([1.0], dtype=np.float32) + + async def get_embeddings_nocache(self, input: list[str]) -> NormalizedEmbeddings: + del input + return np.array([[1.0]], dtype=np.float32) + + async def get_embedding(self, key: str) -> NormalizedEmbedding: + del key + return np.array([1.0], dtype=np.float32) + + async def get_embeddings(self, keys: list[str]) -> NormalizedEmbeddings: + del keys + return np.array([[1.0]], dtype=np.float32) + + +def test_conversation_settings_keep_normal_application_thresholds() -> None: + settings = ConversationSettings(model=FakeEmbeddingModel("text-embedding-3-small")) + + assert ( + settings.related_term_index_settings.embedding_index_settings.min_score + == DEFAULT_RELATED_TERM_MIN_SCORE + ) + assert settings.thread_settings.min_score == DEFAULT_RELATED_TERM_MIN_SCORE + assert ( + settings.message_text_index_settings.embedding_index_settings.min_score + == DEFAULT_MESSAGE_TEXT_MIN_SCORE + ) + assert ( + settings.related_term_index_settings.embedding_index_settings.max_matches == 50 + ) + assert ( + settings.message_text_index_settings.embedding_index_settings.max_matches + is None + ) diff --git a/tests/test_repeat_embedding_benchmarks.py b/tests/test_repeat_embedding_benchmarks.py new file mode 100644 index 00000000..982039ba --- /dev/null +++ b/tests/test_repeat_embedding_benchmarks.py @@ -0,0 +1,60 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. 
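
The sidecar checks that the test_load_corpus_metadata_* tests above exercise reduce to byte arithmetic. A minimal sketch, assuming float32 rows as in the diff's load_corpus_metadata (expected_sidecar_bytes is a hypothetical helper, not part of the diff):

    import numpy as np

    def expected_sidecar_bytes(embedding_size: int, row_count: int) -> int:
        # Each row holds `embedding_size` float32 components, 4 bytes each.
        return row_count * embedding_size * np.dtype(np.float32).itemsize

    # Episode 53 fixture: 106 message rows + 1188 related-term rows of
    # 1536 dimensions -> 1294 * 1536 * 4 = 7,950,336 bytes on disk.
    assert expected_sidecar_bytes(1536, 106 + 1188) == 7_950_336

load_corpus_metadata raises ValueError when the sidecar's byte size is not an exact multiple of the declared row width, or when the row count disagrees with messageCount + relatedCount from the JSON header.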
+ +from importlib.util import module_from_spec, spec_from_file_location +from pathlib import Path +import sys + +MODULE_PATH = ( + Path(__file__).resolve().parent.parent / "tools" / "repeat_embedding_benchmarks.py" +) +sys.path.insert(0, str(MODULE_PATH.parent)) +SPEC = spec_from_file_location("repeat_embedding_benchmarks_for_test", MODULE_PATH) +assert SPEC is not None +assert SPEC.loader is not None +REPEAT_BENCHMARKS = module_from_spec(SPEC) +SPEC.loader.exec_module(REPEAT_BENCHMARKS) + +build_run_suite_metadata = REPEAT_BENCHMARKS.build_run_suite_metadata + + +def test_build_run_suite_metadata_records_ignored_sidecar() -> None: + repo_root = Path(__file__).resolve().parent.parent + + metadata = build_run_suite_metadata( + repo_root=repo_root, + timestamp="20260424T000000Z", + models=["openai:text-embedding-3-small"], + runs=3, + min_scores=[0.01, 0.02], + max_hits_values=[5, 10], + batch_size=16, + ) + + assert metadata["ignored_serialized_embedding_size"] == 1536 + assert metadata["ignored_serialized_message_embedding_count"] == 106 + assert metadata["ignored_serialized_related_embedding_count"] == 1188 + assert metadata["ignored_serialized_total_embedding_count"] == 1294 + assert metadata["pipeline_source_json"] == ( + "tests/testdata/Episode_53_Search_results.json" + ) + assert metadata["related_term_source_json"] == ( + "tests/testdata/Episode_53_Search_results.json" + ) + assert metadata["pipeline_scoring_paths"] == [ + "src/typeagent/knowpro/search.py::run_search_query", + "src/typeagent/knowpro/query.py::MatchSearchTermExpr.accumulate_matches_for_term", + "src/typeagent/knowpro/collections.py::SemanticRefAccumulator.add_term_matches", + "src/typeagent/knowpro/collections.py::add_smooth_related_score_to_match_score", + "src/typeagent/knowpro/query.py::message_matches_from_knowledge_matches", + "src/typeagent/knowpro/collections.py::MessageAccumulator.smooth_scores", + ] + assert metadata["corpus_embedding_source"] == ( + "recomputed_per_model_from_message_text" + ) + assert metadata["query_embedding_source"] == ( + "recomputed_per_model_from_search_text" + ) + assert metadata["min_score_count"] == 2 + assert metadata["max_hits_count"] == 2 + assert metadata["grid_row_count"] == 4 diff --git a/tests/test_vectorbase.py b/tests/test_vectorbase.py index 81ccecc6..763937b1 100644 --- a/tests/test_vectorbase.py +++ b/tests/test_vectorbase.py @@ -7,11 +7,42 @@ from typeagent.aitools.embeddings import ( CachingEmbeddingModel, NormalizedEmbedding, + NormalizedEmbeddings, ) from typeagent.aitools.model_adapters import ( create_test_embedding_model, ) -from typeagent.aitools.vectorbase import TextEmbeddingIndexSettings, VectorBase +from typeagent.aitools.vectorbase import ( + DEFAULT_MIN_SCORE, + TextEmbeddingIndexSettings, + VectorBase, +) + + +class FakeEmbeddingModel: + """Minimal embedding model stub for settings tests.""" + + def __init__(self, model_name: str) -> None: + self.model_name = model_name + + def add_embedding(self, key: str, embedding: NormalizedEmbedding) -> None: + del key, embedding + + async def get_embedding_nocache(self, input: str) -> NormalizedEmbedding: + del input + return np.array([1.0], dtype=np.float32) + + async def get_embeddings_nocache(self, input: list[str]) -> NormalizedEmbeddings: + del input + return np.array([[1.0]], dtype=np.float32) + + async def get_embedding(self, key: str) -> NormalizedEmbedding: + del key + return np.array([1.0], dtype=np.float32) + + async def get_embeddings(self, keys: list[str]) -> NormalizedEmbeddings: + del keys + return 
np.array([[1.0]], dtype=np.float32) @pytest.fixture(scope="function") @@ -38,7 +69,7 @@ def sample_embeddings() -> Samples: } -def test_add_embedding(vector_base: VectorBase, sample_embeddings: Samples): +def test_add_embedding(vector_base: VectorBase, sample_embeddings: Samples) -> None: """Test adding embeddings to the VectorBase.""" for key, embedding in sample_embeddings.items(): vector_base.add_embedding(key, embedding) @@ -48,7 +79,7 @@ def test_add_embedding(vector_base: VectorBase, sample_embeddings: Samples): np.testing.assert_array_equal(vector_base.serialize_embedding_at(i), embedding) -def test_add_embeddings(vector_base: VectorBase, sample_embeddings: Samples): +def test_add_embeddings(vector_base: VectorBase, sample_embeddings: Samples) -> None: """Adding multiple embeddings at once matches repeated single adds.""" keys = list(sample_embeddings.keys()) for key, embedding in sample_embeddings.items(): @@ -71,7 +102,7 @@ def test_add_embeddings(vector_base: VectorBase, sample_embeddings: Samples): @pytest.mark.asyncio -async def test_add_key(vector_base: VectorBase, sample_embeddings: Samples): +async def test_add_key(vector_base: VectorBase, sample_embeddings: Samples) -> None: """Test adding keys to the VectorBase.""" for key in sample_embeddings: await vector_base.add_key(key) @@ -80,7 +111,9 @@ async def test_add_key(vector_base: VectorBase, sample_embeddings: Samples): @pytest.mark.asyncio -async def test_add_key_no_cache(vector_base: VectorBase, sample_embeddings: Samples): +async def test_add_key_no_cache( + vector_base: VectorBase, sample_embeddings: Samples +) -> None: """Test adding keys to the VectorBase with cache disabled.""" for key in sample_embeddings: await vector_base.add_key(key, cache=False) @@ -91,7 +124,7 @@ async def test_add_key_no_cache(vector_base: VectorBase, sample_embeddings: Samp @pytest.mark.asyncio -async def test_add_keys(vector_base: VectorBase, sample_embeddings: Samples): +async def test_add_keys(vector_base: VectorBase, sample_embeddings: Samples) -> None: """Test adding multiple keys to the VectorBase.""" keys = list(sample_embeddings.keys()) await vector_base.add_keys(keys) @@ -100,7 +133,9 @@ async def test_add_keys(vector_base: VectorBase, sample_embeddings: Samples): @pytest.mark.asyncio -async def test_add_keys_no_cache(vector_base: VectorBase, sample_embeddings: Samples): +async def test_add_keys_no_cache( + vector_base: VectorBase, sample_embeddings: Samples +) -> None: """Test adding multiple keys to the VectorBase with cache disabled.""" keys = list(sample_embeddings.keys()) await vector_base.add_keys(keys, cache=False) @@ -111,7 +146,9 @@ async def test_add_keys_no_cache(vector_base: VectorBase, sample_embeddings: Sam @pytest.mark.asyncio -async def test_fuzzy_lookup(vector_base: VectorBase, sample_embeddings: Samples): +async def test_fuzzy_lookup( + vector_base: VectorBase, sample_embeddings: Samples +) -> None: """Test fuzzy lookup functionality.""" for key in sample_embeddings: await vector_base.add_key(key) @@ -122,7 +159,7 @@ async def test_fuzzy_lookup(vector_base: VectorBase, sample_embeddings: Samples) assert results[0].score > 0.9 # High similarity score for the same word -def test_clear(vector_base: VectorBase, sample_embeddings: Samples): +def test_clear(vector_base: VectorBase, sample_embeddings: Samples) -> None: """Test clearing the VectorBase.""" for key, embedding in sample_embeddings.items(): vector_base.add_embedding(key, embedding) @@ -132,7 +169,9 @@ def test_clear(vector_base: VectorBase, sample_embeddings: 
Samples): assert len(vector_base) == 0 -def test_serialize_deserialize(vector_base: VectorBase, sample_embeddings: Samples): +def test_serialize_deserialize( + vector_base: VectorBase, sample_embeddings: Samples +) -> None: """Test serialization and deserialization of the VectorBase.""" for key, embedding in sample_embeddings.items(): vector_base.add_embedding(key, embedding) @@ -149,12 +188,12 @@ def test_serialize_deserialize(vector_base: VectorBase, sample_embeddings: Sampl ) -def test_vectorbase_bool(vector_base: VectorBase): +def test_vectorbase_bool(vector_base: VectorBase) -> None: """__bool__ should always return True.""" assert bool(vector_base) is True -def test_get_embedding_at(vector_base: VectorBase, sample_embeddings: Samples): +def test_get_embedding_at(vector_base: VectorBase, sample_embeddings: Samples) -> None: """Test get_embedding_at returns correct embedding and raises IndexError.""" for key, embedding in sample_embeddings.items(): vector_base.add_embedding(key, embedding) @@ -169,7 +208,7 @@ def test_get_embedding_at(vector_base: VectorBase, sample_embeddings: Samples): def test_fuzzy_lookup_embedding_in_subset( vector_base: VectorBase, sample_embeddings: Samples -): +) -> None: """Test fuzzy_lookup_embedding_in_subset returns best match in subset or None.""" keys = list(sample_embeddings.keys()) for key, embedding in sample_embeddings.items(): @@ -197,6 +236,22 @@ def test_fuzzy_lookup_embedding_in_subset( assert result == [] +def test_fuzzy_lookup_embedding_reports_normalized_score_scale() -> None: + vector_base = make_vector_base() + vector_base.add_embedding(None, np.array([1.0, 0.0], dtype=np.float32)) + vector_base.add_embedding(None, np.array([0.0, 1.0], dtype=np.float32)) + vector_base.add_embedding(None, np.array([-1.0, 0.0], dtype=np.float32)) + + results = vector_base.fuzzy_lookup_embedding( + np.array([1.0, 0.0], dtype=np.float32), + max_hits=3, + min_score=0.0, + ) + + assert [result.item for result in results] == [0, 1, 2] + assert [result.score for result in results] == [1.0, 0.5, 0.0] + + def test_add_embedding_size_mismatch(vector_base: VectorBase) -> None: """Adding an embedding of wrong size raises ValueError.""" emb3 = np.array([0.1, 0.2, 0.3], dtype=np.float32) @@ -220,3 +275,51 @@ def test_add_embeddings_wrong_ndim(vector_base: VectorBase) -> None: emb1d = np.array([0.1, 0.2, 0.3], dtype=np.float32) with pytest.raises(ValueError, match="Expected 2D"): vector_base.add_embeddings(None, emb1d) + + +@pytest.mark.parametrize( + ("model_name", "expected_min_score"), + [ + ("text-embedding-3-large", 0.74), + ("text-embedding-3-small", 0.73), + ("text-embedding-ada-002", 0.93), + ], +) +def test_text_embedding_index_settings_uses_known_model_default( + model_name: str, expected_min_score: float +) -> None: + settings = TextEmbeddingIndexSettings( + embedding_model=FakeEmbeddingModel(model_name) + ) + + assert settings.min_score == expected_min_score + assert settings.max_matches is None + + +def test_text_embedding_index_settings_keeps_unknown_model_fallback() -> None: + settings = TextEmbeddingIndexSettings( + embedding_model=FakeEmbeddingModel("custom-embedding-model") + ) + + assert settings.min_score == DEFAULT_MIN_SCORE + assert settings.max_matches is None + + +def test_text_embedding_index_settings_explicit_overrides_win() -> None: + settings = TextEmbeddingIndexSettings( + embedding_model=FakeEmbeddingModel("text-embedding-3-large"), + min_score=0.55, + max_matches=7, + ) + + assert settings.min_score == 0.55 + assert settings.max_matches == 7 + 
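
Taken together, the settings tests above pin down a three-step precedence for the cutoff. A condensed sketch (resolve_min_score is illustrative only; the real logic lives in TextEmbeddingIndexSettings.__init__):

    MODEL_DEFAULT_MIN_SCORES = {
        "text-embedding-3-large": 0.74,
        "text-embedding-3-small": 0.73,
        "text-embedding-ada-002": 0.93,
    }

    def resolve_min_score(model_name: str, min_score: float | None = None) -> float:
        # 1. An explicit caller-supplied value always wins.
        if min_score is not None:
            return min_score
        # 2. Known models use their benchmarked default;
        # 3. unknown models keep the long-standing 0.85 fallback.
        return MODEL_DEFAULT_MIN_SCORES.get(model_name, 0.85)

    assert resolve_min_score("text-embedding-3-large") == 0.74
    assert resolve_min_score("text-embedding-3-large", min_score=0.55) == 0.55
    assert resolve_min_score("custom-embedding-model") == 0.85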
+ +def test_text_embedding_index_settings_invalid_max_matches_becomes_none() -> None: + settings = TextEmbeddingIndexSettings( + embedding_model=FakeEmbeddingModel("text-embedding-3-large"), + max_matches=0, + ) + + assert settings.max_matches is None diff --git a/tools/benchmark_embeddings.py b/tools/benchmark_embeddings.py new file mode 100644 index 00000000..96d1c036 --- /dev/null +++ b/tools/benchmark_embeddings.py @@ -0,0 +1,1133 @@ +# Copyright (c) Microsoft Corporation. +# Licensed under the MIT License. + +"""Benchmark retrieval settings for known embedding models. + +This script evaluates the Adrian Tchaikovsky Episode 53 search dataset in +`tests/testdata/` and reports retrieval quality for combinations of +`min_score` and `max_hits`. + +Methodology: +- Load only message text from the benchmark `_data.json` payload. +- Treat the serialized `_embeddings.bin` sidecar as metadata only. +- Recompute corpus and query embeddings with the requested model. + +The benchmark is intentionally narrow: +- It only measures retrieval against `messageMatches` ground truth. +- It is meant to help choose repository defaults for known models. +- In practice, `min_score` is the primary library default this informs. +- It does not prove universal "best" settings for every dataset. +It also includes a semantic answer-context signal from the answer fixture: +- Answerable questions should retrieve messages close to the expected answer. +- No-answer questions should avoid high-confidence retrieved context. + +Usage: + uv run python tools/benchmark_embeddings.py + uv run python tools/benchmark_embeddings.py --model openai:text-embedding-3-small + uv run python tools/benchmark_embeddings.py --model openai:text-embedding-3-small --min-score-start 0.01 --min-score-stop 0.20 --min-score-step 0.01 +""" + +import argparse +import asyncio +from copy import deepcopy +from dataclasses import dataclass +from decimal import Decimal +import json +from pathlib import Path +from statistics import mean + +from dotenv import load_dotenv +import numpy as np + +from typeagent.aitools.embeddings import ( + IEmbeddingModel, + NormalizedEmbeddings, +) +from typeagent.aitools.model_adapters import create_embedding_model +from typeagent.aitools.vectorbase import ( + TextEmbeddingIndexSettings, + VectorBase, +) +from typeagent.knowpro import search, secindex, serialization +from typeagent.knowpro.convsettings import ( + ConversationSettings, + MessageTextIndexSettings, + RelatedTermIndexSettings, +) +from typeagent.podcasts import podcast + +DEFAULT_MIN_SCORES = [score / 10 for score in range(1, 10)] +DEFAULT_MAX_HITS = [5, 10, 15, 20] +DATA_DIR = Path("tests") / "testdata" +INDEX_PREFIX_PATH = DATA_DIR / "Episode_53_AdrianTchaikovsky_index" +INDEX_DATA_PATH = DATA_DIR / "Episode_53_AdrianTchaikovsky_index_data.json" +INDEX_EMBEDDINGS_PATH = DATA_DIR / "Episode_53_AdrianTchaikovsky_index_embeddings.bin" +SEARCH_RESULTS_PATH = DATA_DIR / "Episode_53_Search_results.json" +ANSWER_RESULTS_PATH = DATA_DIR / "Episode_53_Answer_results.json" +CORPUS_EMBEDDING_SOURCE = "recomputed_per_model_from_message_text" +QUERY_EMBEDDING_SOURCE = "recomputed_per_model_from_search_text" +ANSWER_EMBEDDING_SOURCE = "recomputed_per_model_from_expected_answer_text" +PIPELINE_SCORING_PATHS = [ + "src/typeagent/knowpro/search.py::run_search_query", + "src/typeagent/knowpro/query.py::MatchSearchTermExpr.accumulate_matches_for_term", + "src/typeagent/knowpro/collections.py::SemanticRefAccumulator.add_term_matches", + 
"src/typeagent/knowpro/collections.py::add_smooth_related_score_to_match_score", + "src/typeagent/knowpro/query.py::message_matches_from_knowledge_matches", + "src/typeagent/knowpro/collections.py::MessageAccumulator.smooth_scores", +] + + +def score_from_cosine(cosine_similarity: np.ndarray) -> np.ndarray: + """Map cosine similarity from -1..1 to the public 0..1 score scale.""" + + return np.clip((cosine_similarity + 1.0) / 2.0, 0.0, 1.0) + + +@dataclass +class SearchQueryCase: + """A benchmark query paired with the message ordinals it should retrieve.""" + + query: str + expected_matches: list[int] + + +@dataclass +class SearchMetrics: + """Aggregate retrieval quality metrics for one benchmark row.""" + + hit_rate: float + mean_reciprocal_rank: float + + +@dataclass +class PipelineQueryCase: + """A compiled query fixture with message-level ground truth.""" + + query: str + query_exprs: list[search.SearchQueryExpr] + expected_matches: list[int] + + +@dataclass +class PipelineMetrics: + """Aggregate metrics from the real query scoring pipeline.""" + + hit_rate: float + mean_reciprocal_rank: float + mean_result_count: float + + +@dataclass +class RelatedTermQueryCase: + """A search term paired with related terms from the compiled query fixture.""" + + term: str + expected_related_terms: list[str] + + +@dataclass +class RelatedTermMetrics: + """Aggregate fuzzy related-term retrieval metrics for one benchmark row.""" + + hit_rate: float + mean_reciprocal_rank: float + mean_result_count: float + + +@dataclass +class AnswerQueryCase: + """A benchmark answer case paired with its expected answerability.""" + + question: str + answer: str + has_no_answer: bool + + +@dataclass +class AnswerMetrics: + """Aggregate semantic answer-context metrics for one benchmark row.""" + + answerable_support: float + no_answer_rejection_rate: float + semantic_score: float + + +@dataclass +class TopScoreStats: + """Observed top-1 score statistics across all benchmark queries.""" + + min_top_score: float + mean_top_score: float + max_top_score: float + + +@dataclass +class BenchmarkRow: + """One `(min_score, max_hits)` configuration evaluated by the benchmark.""" + + min_score: float + max_hits: int + metrics: SearchMetrics + pipeline_metrics: PipelineMetrics | None = None + related_metrics: RelatedTermMetrics | None = None + answer_metrics: AnswerMetrics | None = None + + +@dataclass +class CorpusMetadata: + """Metadata about the serialized benchmark corpus fixture.""" + + message_count: int + serialized_embedding_size: int | None + serialized_message_count: int | None + serialized_related_count: int | None + serialized_total_embedding_count: int | None + + +def parse_float_list(raw: str | None) -> list[float]: + """Parse explicit min-score values or fall back to the coarse default grid.""" + + if raw is None: + return DEFAULT_MIN_SCORES + values = [float(item.strip()) for item in raw.split(",") if item.strip()] + if not values: + raise ValueError("--min-scores must contain at least one value") + return values + + +def build_float_range(start: float, stop: float, step: float) -> list[float]: + """Build an inclusive decimal-safe float range for score sweeps.""" + + if step <= 0: + raise ValueError("--min-score-step must be positive") + if start > stop: + raise ValueError("--min-score-start must be <= --min-score-stop") + + start_decimal = Decimal(str(start)) + stop_decimal = Decimal(str(stop)) + step_decimal = Decimal(str(step)) + values: list[float] = [] + current = start_decimal + while current <= stop_decimal: + 
values.append(float(current)) + current += step_decimal + return values + + +def resolve_min_scores( + raw: str | None, + start: float | None, + stop: float | None, + step: float | None, +) -> list[float]: + """Resolve the benchmark min-score grid from explicit values or a generated range.""" + + range_args = [start, stop, step] + using_range = any(value is not None for value in range_args) + if using_range: + if raw is not None: + raise ValueError( + "Use either --min-scores or the --min-score-start/stop/step range" + ) + if any(value is None for value in range_args): + raise ValueError( + "--min-score-start, --min-score-stop, and --min-score-step must all be set together" + ) + assert start is not None + assert stop is not None + assert step is not None + return build_float_range(start, stop, step) + return parse_float_list(raw) + + +def parse_int_list(raw: str | None) -> list[int]: + """Parse positive integer arguments such as `max_hits` grids.""" + + if raw is None: + return DEFAULT_MAX_HITS + values = [int(item.strip()) for item in raw.split(",") if item.strip()] + if not values: + raise ValueError("--max-hits must contain at least one value") + if any(value <= 0 for value in values): + raise ValueError("--max-hits values must be positive integers") + return values + + +def load_message_texts(repo_root: Path) -> list[str]: + """Load the benchmark corpus as one text blob per message. + + The JSON fixture also points at a serialized embedding sidecar, but that + sidecar is deliberately ignored here. Cross-model comparisons are only + meaningful when every evaluated model embeds the same raw message text. + """ + + index_data = json.loads((repo_root / INDEX_DATA_PATH).read_text(encoding="utf-8")) + messages = index_data["messages"] + return [" ".join(message.get("textChunks", [])) for message in messages] + + +def load_related_term_texts(repo_root: Path) -> list[str]: + """Load the term corpus used by fuzzy related-term lookup.""" + + index_data = json.loads((repo_root / INDEX_DATA_PATH).read_text(encoding="utf-8")) + related_terms_index_data = index_data.get("relatedTermsIndexData") or {} + text_embedding_data = related_terms_index_data.get("textEmbeddingData") or {} + text_items = text_embedding_data.get("textItems") + if isinstance(text_items, list) and text_items: + return [text for text in text_items if isinstance(text, str)] + + semantic_index_data = index_data.get("semanticIndexData") or {} + items = semantic_index_data.get("items") or [] + return [item["term"] for item in items if isinstance(item.get("term"), str)] + + +def load_corpus_metadata(repo_root: Path) -> CorpusMetadata: + """Load sidecar metadata without loading the sidecar embeddings.""" + + index_data = json.loads((repo_root / INDEX_DATA_PATH).read_text(encoding="utf-8")) + embedding_file_header = index_data.get("embeddingFileHeader") or {} + model_metadata = embedding_file_header.get("modelMetadata") or {} + serialized_embedding_size = model_metadata.get("embeddingSize") + serialized_message_count = embedding_file_header.get("messageCount") + serialized_related_count = embedding_file_header.get("relatedCount") + serialized_total_embedding_count: int | None = None + + sidecar_path = repo_root / INDEX_EMBEDDINGS_PATH + if serialized_embedding_size is not None and sidecar_path.exists(): + bytes_per_embedding = serialized_embedding_size * np.dtype(np.float32).itemsize + if bytes_per_embedding <= 0: + raise ValueError( + "Serialized benchmark corpus has a non-positive embedding size" + ) + sidecar_size_bytes = 
sidecar_path.stat().st_size + if sidecar_size_bytes % bytes_per_embedding != 0: + raise ValueError( + "Serialized benchmark sidecar size is not divisible by the declared " + f"embedding width of {serialized_embedding_size}" + ) + serialized_total_embedding_count = sidecar_size_bytes // bytes_per_embedding + declared_total_count = (serialized_message_count or 0) + ( + serialized_related_count or 0 + ) + if ( + declared_total_count + and declared_total_count != serialized_total_embedding_count + ): + raise ValueError( + "Serialized benchmark sidecar row count does not match the counts " + "declared in the JSON metadata" + ) + + return CorpusMetadata( + message_count=len(index_data.get("messages", [])), + serialized_embedding_size=serialized_embedding_size, + serialized_message_count=serialized_message_count, + serialized_related_count=serialized_related_count, + serialized_total_embedding_count=serialized_total_embedding_count, + ) + + +def load_search_queries(repo_root: Path) -> list[SearchQueryCase]: + """Load benchmark queries that include message-level ground-truth matches.""" + + search_data = json.loads( + (repo_root / SEARCH_RESULTS_PATH).read_text(encoding="utf-8") + ) + cases: list[SearchQueryCase] = [] + for item in search_data: + search_text = item.get("searchText") + results = item.get("results", []) + if not search_text or not results: + continue + expected_matches = results[0].get("messageMatches", []) + if not expected_matches: + continue + cases.append(SearchQueryCase(search_text, expected_matches)) + return cases + + +def strip_related_terms(value: object) -> None: + """Remove cached related-term expansions so each model resolves its own.""" + + for obj in iter_dicts(value): + related_terms = obj.get("relatedTerms") + if isinstance(related_terms, list) and related_terms: + obj["relatedTerms"] = None + + +def load_pipeline_queries(repo_root: Path) -> list[PipelineQueryCase]: + """Load compiled query fixtures for the real semantic scoring pipeline.""" + + search_data = json.loads( + (repo_root / SEARCH_RESULTS_PATH).read_text(encoding="utf-8") + ) + cases: list[PipelineQueryCase] = [] + for item in search_data: + search_text = item.get("searchText") + compiled_query_expr = item.get("compiledQueryExpr") + results = item.get("results", []) + if not ( + isinstance(search_text, str) + and isinstance(compiled_query_expr, list) + and results + ): + continue + expected_matches = results[0].get("messageMatches", []) + if not expected_matches: + continue + strip_related_terms(compiled_query_expr) + query_exprs = serialization.deserialize_object( + list[search.SearchQueryExpr], + compiled_query_expr, + ) + cases.append( + PipelineQueryCase( + query=search_text, + query_exprs=query_exprs, + expected_matches=expected_matches, + ) + ) + return cases + + +def iter_dicts(value: object): + """Yield dictionaries recursively from a decoded JSON value.""" + + if isinstance(value, dict): + yield value + for child in value.values(): + yield from iter_dicts(child) + elif isinstance(value, list): + for child in value: + yield from iter_dicts(child) + + +def load_related_term_queries(repo_root: Path) -> list[RelatedTermQueryCase]: + """Load expected fuzzy related-term outputs from compiled query fixtures. + + These compiled fixtures are closer to the real query pipeline than raw + query-to-message similarity: `min_score` normally gates fuzzy related-term + expansion before semantic-ref and message scores are accumulated. 
+ """ + + search_data = json.loads( + (repo_root / SEARCH_RESULTS_PATH).read_text(encoding="utf-8") + ) + cases: list[RelatedTermQueryCase] = [] + seen: set[tuple[str, tuple[str, ...]]] = set() + for item in search_data: + for obj in iter_dicts(item.get("compiledQueryExpr", [])): + term = obj.get("term") + related_terms = obj.get("relatedTerms") + if not ( + isinstance(term, dict) + and isinstance(term.get("text"), str) + and isinstance(related_terms, list) + and related_terms + ): + continue + expected: list[str] = [] + for related in related_terms: + if isinstance(related, dict): + related_text = related.get("text") + if isinstance(related_text, str): + expected.append(related_text) + if not expected: + continue + key = (term["text"], tuple(expected)) + if key not in seen: + seen.add(key) + cases.append(RelatedTermQueryCase(term["text"], expected)) + return cases + + +def load_answer_queries(repo_root: Path) -> list[AnswerQueryCase]: + """Load expected answers for semantic answer-context benchmarking.""" + + answer_data = json.loads( + (repo_root / ANSWER_RESULTS_PATH).read_text(encoding="utf-8") + ) + cases: list[AnswerQueryCase] = [] + for item in answer_data: + question = item.get("question") + answer = item.get("answer") + has_no_answer = item.get("hasNoAnswer") + if ( + isinstance(question, str) + and isinstance(answer, str) + and isinstance(has_no_answer, bool) + ): + cases.append(AnswerQueryCase(question, answer, has_no_answer)) + return cases + + +async def build_vector_base( + model_spec: str | None, + message_texts: list[str], + batch_size: int, +) -> tuple[IEmbeddingModel, VectorBase]: + """Build a message-level vector index for the benchmark corpus. + + This computes fresh embeddings for `message_texts` with the requested + model. It does not deserialize or consult the fixture's `_embeddings.bin` + sidecar, which may have been generated by a different embedding model. + """ + + model = create_embedding_model(model_spec) + vector_base = await build_text_vector_base(model, message_texts, batch_size) + return model, vector_base + + +async def build_pipeline_conversation( + repo_root: Path, + model: IEmbeddingModel, +) -> podcast.Podcast: + """Build the benchmark conversation with per-model secondary indexes. + + The fixture's serialized embedding sidecar is deliberately not used here. + We keep the semantic refs and exact semantic index, then rebuild related-term + and message-text indexes with the requested model so `min_score` gates the + same fuzzy expansion path the runtime uses. 
+ """ + + settings = create_benchmark_conversation_settings(model) + data = podcast.Podcast._read_conversation_data_from_file( + str(repo_root / INDEX_PREFIX_PATH) + ) + data.pop("relatedTermsIndexData", None) + data.pop("messageIndexData", None) + conversation = await podcast.Podcast.create(settings) + await conversation.deserialize(data) + await secindex.build_secondary_indexes(conversation, settings) + return conversation + + +def create_benchmark_conversation_settings( + model: IEmbeddingModel, +) -> ConversationSettings: + """Use benchmarked model defaults without changing normal app settings.""" + + settings = ConversationSettings(model=model) + benchmark_min_score = TextEmbeddingIndexSettings(model).min_score + settings.related_term_index_settings = RelatedTermIndexSettings( + TextEmbeddingIndexSettings( + model, + min_score=benchmark_min_score, + max_matches=50, + ) + ) + settings.thread_settings = TextEmbeddingIndexSettings( + model, + min_score=benchmark_min_score, + ) + settings.message_text_index_settings = MessageTextIndexSettings( + TextEmbeddingIndexSettings( + model, + min_score=benchmark_min_score, + ) + ) + return settings + + +async def build_text_vector_base( + model: IEmbeddingModel, + texts: list[str], + batch_size: int, +) -> VectorBase: + """Build a vector index for already selected benchmark text items.""" + + settings = TextEmbeddingIndexSettings( + embedding_model=model, + min_score=0.0, + max_matches=None, + batch_size=batch_size, + ) + vector_base = VectorBase(settings) + for start in range(0, len(texts), batch_size): + batch = texts[start : start + batch_size] + embeddings = await model.get_embeddings_nocache(batch) + vector_base.add_embeddings(None, embeddings) + return vector_base + + +def evaluate_search_queries( + vector_base: VectorBase, + query_cases: list[SearchQueryCase], + query_embeddings: NormalizedEmbeddings, + min_score: float, + max_hits: int, +) -> SearchMetrics: + """Evaluate one benchmark row over every labeled query.""" + + hit_count = 0 + reciprocal_ranks: list[float] = [] + + for case, query_embedding in zip(query_cases, query_embeddings): + scored_results = vector_base.fuzzy_lookup_embedding( + query_embedding, + max_hits=max_hits, + min_score=min_score, + ) + rank = 0 + for result_index, scored_result in enumerate(scored_results, start=1): + if scored_result.item in case.expected_matches: + rank = result_index + break + if rank > 0: + hit_count += 1 + reciprocal_ranks.append(1.0 / rank) + else: + reciprocal_ranks.append(0.0) + + return SearchMetrics( + hit_rate=(hit_count / len(query_cases)) * 100, + mean_reciprocal_rank=mean(reciprocal_ranks), + ) + + +async def evaluate_pipeline_queries( + conversation: podcast.Podcast, + query_cases: list[PipelineQueryCase], + min_score: float, + max_hits: int, +) -> PipelineMetrics: + """Evaluate compiled queries through the runtime semantic scoring path.""" + + related_settings = ( + conversation.settings.related_term_index_settings.embedding_index_settings + ) + related_settings.min_score = min_score + hit_count = 0 + reciprocal_ranks: list[float] = [] + result_counts: list[int] = [] + options = search.SearchOptions(max_message_matches=max_hits) + + for case in query_cases: + query_exprs = deepcopy(case.query_exprs) + scored_results = [] + for query_expr in query_exprs: + search_results = await search.run_search_query( + conversation, + query_expr, + options, + ) + for result in search_results: + scored_results.extend(result.message_matches) + scored_results.sort(key=lambda result: result.score, 
reverse=True) + result_counts.append(len(scored_results)) + expected_matches = set(case.expected_matches) + rank = 0 + for result_index, scored_result in enumerate(scored_results, start=1): + if scored_result.message_ordinal in expected_matches: + rank = result_index + break + if rank > 0: + hit_count += 1 + reciprocal_ranks.append(1.0 / rank) + else: + reciprocal_ranks.append(0.0) + + return PipelineMetrics( + hit_rate=(hit_count / len(query_cases)) * 100, + mean_reciprocal_rank=mean(reciprocal_ranks), + mean_result_count=mean(result_counts), + ) + + +def evaluate_related_term_queries( + vector_base: VectorBase, + related_terms: list[str], + query_cases: list[RelatedTermQueryCase], + query_embeddings: NormalizedEmbeddings, + min_score: float, + max_hits: int, +) -> RelatedTermMetrics: + """Evaluate fuzzy related-term retrieval against compiled query fixtures.""" + + hit_count = 0 + reciprocal_ranks: list[float] = [] + result_counts: list[int] = [] + + for case, query_embedding in zip(query_cases, query_embeddings): + expected_terms = set(case.expected_related_terms) + scored_results = vector_base.fuzzy_lookup_embedding( + query_embedding, + max_hits=max_hits, + min_score=min_score, + ) + result_counts.append(len(scored_results)) + rank = 0 + for result_index, scored_result in enumerate(scored_results, start=1): + if related_terms[scored_result.item] in expected_terms: + rank = result_index + break + if rank > 0: + hit_count += 1 + reciprocal_ranks.append(1.0 / rank) + else: + reciprocal_ranks.append(0.0) + + return RelatedTermMetrics( + hit_rate=(hit_count / len(query_cases)) * 100, + mean_reciprocal_rank=mean(reciprocal_ranks), + mean_result_count=mean(result_counts), + ) + + +def evaluate_answer_queries( + vector_base: VectorBase, + answer_cases: list[AnswerQueryCase], + question_embeddings: NormalizedEmbeddings, + answer_embeddings: NormalizedEmbeddings, + min_score: float, + max_hits: int, +) -> AnswerMetrics: + """Evaluate whether retrieved message context semantically supports answers.""" + + answerable_support_scores: list[float] = [] + no_answer_rejections = 0 + no_answer_count = 0 + corpus_embeddings = vector_base.serialize() + + for case, question_embedding, answer_embedding in zip( + answer_cases, + question_embeddings, + answer_embeddings, + strict=True, + ): + scored_results = vector_base.fuzzy_lookup_embedding( + question_embedding, + max_hits=max_hits, + min_score=min_score, + ) + if case.has_no_answer: + no_answer_count += 1 + if not scored_results: + no_answer_rejections += 1 + continue + + if not scored_results: + answerable_support_scores.append(0.0) + continue + + retrieved_embeddings = corpus_embeddings[ + [scored_result.item for scored_result in scored_results] + ] + scores = score_from_cosine(np.dot(retrieved_embeddings, answer_embedding)) + answerable_support_scores.append(float(np.max(scores))) + + answerable_support = ( + mean(answerable_support_scores) if answerable_support_scores else 0.0 + ) + no_answer_rejection_rate = ( + (no_answer_rejections / no_answer_count) * 100 if no_answer_count else 100.0 + ) + semantic_score = (answerable_support * 100 + no_answer_rejection_rate) / 2 + return AnswerMetrics( + answerable_support=answerable_support, + no_answer_rejection_rate=no_answer_rejection_rate, + semantic_score=semantic_score, + ) + + +async def evaluate_grid( + vector_base: VectorBase, + query_cases: list[SearchQueryCase], + query_embeddings: NormalizedEmbeddings, + min_scores: list[float], + max_hits_values: list[int], + pipeline_conversation: 
+
+
+async def evaluate_grid(
+    vector_base: VectorBase,
+    query_cases: list[SearchQueryCase],
+    query_embeddings: NormalizedEmbeddings,
+    min_scores: list[float],
+    max_hits_values: list[int],
+    pipeline_conversation: podcast.Podcast | None = None,
+    pipeline_query_cases: list[PipelineQueryCase] | None = None,
+    related_vector_base: VectorBase | None = None,
+    related_terms: list[str] | None = None,
+    related_query_cases: list[RelatedTermQueryCase] | None = None,
+    related_query_embeddings: NormalizedEmbeddings | None = None,
+    answer_cases: list[AnswerQueryCase] | None = None,
+    answer_question_embeddings: NormalizedEmbeddings | None = None,
+    answer_embeddings: NormalizedEmbeddings | None = None,
+    progress_label: str | None = None,
+) -> list[BenchmarkRow]:
+    """Evaluate every `(min_score, max_hits)` row in the requested grid."""
+
+    rows: list[BenchmarkRow] = []
+    for min_score_index, min_score in enumerate(min_scores, start=1):
+        if progress_label and (
+            min_score_index == 1
+            or min_score_index == len(min_scores)
+            or min_score_index % 10 == 0
+        ):
+            print(
+                f"{progress_label}: min_score {min_score:.2f} "
+                f"({min_score_index}/{len(min_scores)})...",
+                flush=True,
+            )
+        for max_hits in max_hits_values:
+            metrics = evaluate_search_queries(
+                vector_base,
+                query_cases,
+                query_embeddings,
+                min_score,
+                max_hits,
+            )
+            pipeline_metrics = None
+            if pipeline_conversation is not None and pipeline_query_cases is not None:
+                pipeline_metrics = await evaluate_pipeline_queries(
+                    pipeline_conversation,
+                    pipeline_query_cases,
+                    min_score,
+                    max_hits,
+                )
+            related_metrics = None
+            if (
+                related_vector_base is not None
+                and related_terms is not None
+                and related_query_cases is not None
+                and related_query_embeddings is not None
+            ):
+                related_metrics = evaluate_related_term_queries(
+                    related_vector_base,
+                    related_terms,
+                    related_query_cases,
+                    related_query_embeddings,
+                    min_score,
+                    max_hits,
+                )
+            answer_metrics = None
+            if (
+                answer_cases is not None
+                and answer_question_embeddings is not None
+                and answer_embeddings is not None
+            ):
+                answer_metrics = evaluate_answer_queries(
+                    vector_base,
+                    answer_cases,
+                    answer_question_embeddings,
+                    answer_embeddings,
+                    min_score,
+                    max_hits,
+                )
+            rows.append(
+                BenchmarkRow(
+                    min_score,
+                    max_hits,
+                    metrics,
+                    pipeline_metrics,
+                    related_metrics,
+                    answer_metrics,
+                )
+            )
+    return rows
+
+
+def measure_top_score_stats(
+    vector_base: VectorBase,
+    query_embeddings: NormalizedEmbeddings,
+) -> TopScoreStats:
+    """Measure the achievable top-1 score range for the current model and corpus."""
+
+    top_scores: list[float] = []
+    for query_embedding in query_embeddings:
+        scored_results = vector_base.fuzzy_lookup_embedding(
+            query_embedding,
+            max_hits=1,
+            min_score=0.0,
+        )
+        top_scores.append(scored_results[0].score if scored_results else 0.0)
+
+    return TopScoreStats(
+        min_top_score=min(top_scores),
+        mean_top_score=mean(top_scores),
+        max_top_score=max(top_scores),
+    )
+
+
+def filter_min_scores_by_ceiling(
+    min_scores: list[float], max_top_score: float
+) -> tuple[list[float], list[float]]:
+    """Keep the requested min-score grid intact.
+
+    The observed top-1 ceiling is reported for diagnostics only; no grid
+    values are filtered out, so the second list is always empty.
+    """
+
+    _ = max_top_score
+    return list(min_scores), []
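`evaluate_grid` visits the full Cartesian product of the two axes, so the row count is always `len(min_scores) * len(max_hits_values)`. A toy enumeration, with invented sweep values (the real ones come from the CLI flags defined further down):

```python
# Invented sweep values for illustration only.
min_scores = [round(0.60 + 0.05 * i, 2) for i in range(5)]  # 0.60 .. 0.80
max_hits_values = [5, 10, 25]

grid = [(ms, mh) for ms in min_scores for mh in max_hits_values]
assert len(grid) == len(min_scores) * len(max_hits_values)  # 15 rows
print(grid[:4])  # [(0.6, 5), (0.6, 10), (0.6, 25), (0.65, 5)]
```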
+
+
+def select_best_row(rows: list[BenchmarkRow]) -> BenchmarkRow:
+    """Prefer true pipeline quality, then related-term quality and strictness."""
+
+    return max(
+        rows,
+        key=lambda row: (
+            row.pipeline_metrics.mean_reciprocal_rank if row.pipeline_metrics else 0.0,
+            row.pipeline_metrics.hit_rate if row.pipeline_metrics else 0.0,
+            row.related_metrics.mean_reciprocal_rank if row.related_metrics else 0.0,
+            row.related_metrics.hit_rate if row.related_metrics else 0.0,
+            row.metrics.mean_reciprocal_rank,
+            row.metrics.hit_rate,
+            row.min_score,
+            row.answer_metrics.semantic_score if row.answer_metrics else 0.0,
+            -row.max_hits,
+        ),
+    )
+
+
+def print_rows(rows: list[BenchmarkRow]) -> None:
+    """Print the benchmark grid in a reviewer-friendly table."""
+
+    print("=" * 72)
+    print("PIPELINE + SEARCH + ANSWER-CONTEXT BENCHMARK (Episode 53 fixtures)")
+    print("=" * 72)
+    print(
+        f"{'Min Score':<12} | {'Max Hits':<10} | {'Hit Rate (%)':<15} | "
+        f"{'MRR':<10} | {'Pipe Hit':<10} | {'Pipe MRR':<10} | "
+        f"{'Pipe Cnt':<10} | {'Rel Hit':<10} | {'Rel MRR':<10} | "
+        f"{'Rel Cnt':<10} | {'Ans Sup':<10} | {'NoAns (%)':<10} | {'Sem':<10}"
+    )
+    print("-" * 174)
+    for row in rows:
+        pipeline_hit_rate = (
+            f"{row.pipeline_metrics.hit_rate:<10.2f}"
+            if row.pipeline_metrics
+            else f"{'n/a':<10}"
+        )
+        pipeline_mrr = (
+            f"{row.pipeline_metrics.mean_reciprocal_rank:<10.4f}"
+            if row.pipeline_metrics
+            else f"{'n/a':<10}"
+        )
+        pipeline_count = (
+            f"{row.pipeline_metrics.mean_result_count:<10.2f}"
+            if row.pipeline_metrics
+            else f"{'n/a':<10}"
+        )
+        related_hit_rate = (
+            f"{row.related_metrics.hit_rate:<10.2f}"
+            if row.related_metrics
+            else f"{'n/a':<10}"
+        )
+        related_mrr = (
+            f"{row.related_metrics.mean_reciprocal_rank:<10.4f}"
+            if row.related_metrics
+            else f"{'n/a':<10}"
+        )
+        related_count = (
+            f"{row.related_metrics.mean_result_count:<10.2f}"
+            if row.related_metrics
+            else f"{'n/a':<10}"
+        )
+        answer_support = (
+            f"{row.answer_metrics.answerable_support:<10.4f}"
+            if row.answer_metrics
+            else f"{'n/a':<10}"
+        )
+        no_answer = (
+            f"{row.answer_metrics.no_answer_rejection_rate:<10.2f}"
+            if row.answer_metrics
+            else f"{'n/a':<10}"
+        )
+        semantic_score = (
+            f"{row.answer_metrics.semantic_score:<10.2f}"
+            if row.answer_metrics
+            else f"{'n/a':<10}"
+        )
+        print(
+            f"{row.min_score:<12.2f} | {row.max_hits:<10d} | "
+            f"{row.metrics.hit_rate:<15.2f} | "
+            f"{row.metrics.mean_reciprocal_rank:<10.4f} | "
+            f"{pipeline_hit_rate} | {pipeline_mrr} | {pipeline_count} | "
+            f"{related_hit_rate} | {related_mrr} | {related_count} | "
+            f"{answer_support} | {no_answer} | {semantic_score}"
+        )
+    print("-" * 174)
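`select_best_row` relies on Python comparing tuples lexicographically, so earlier key positions strictly dominate later ones. A stripped-down illustration with made-up rows (a toy stand-in for `BenchmarkRow`, keeping only three of the key positions):

```python
from dataclasses import dataclass

@dataclass
class Row:  # toy stand-in for BenchmarkRow
    pipeline_mrr: float
    min_score: float
    max_hits: int

rows = [Row(0.72, 0.60, 25), Row(0.72, 0.75, 10), Row(0.70, 0.90, 5)]

# Equal pipeline MRR falls through to strictness (higher min_score wins),
# then to fewer hits (max_hits is negated so smaller values rank higher).
best = max(rows, key=lambda r: (r.pipeline_mrr, r.min_score, -r.max_hits))
print(best)  # Row(pipeline_mrr=0.72, min_score=0.75, max_hits=10)
```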
+
+
+async def run_benchmark(
+    model_spec: str | None,
+    min_scores: list[float],
+    max_hits_values: list[int],
+    batch_size: int,
+) -> None:
+    """Run a single benchmark sweep and print the evaluated grid."""
+
+    load_dotenv()
+
+    repo_root = Path(__file__).resolve().parent.parent
+    message_texts = load_message_texts(repo_root)
+    related_terms = load_related_term_texts(repo_root)
+    corpus_metadata = load_corpus_metadata(repo_root)
+    query_cases = load_search_queries(repo_root)
+    pipeline_query_cases = load_pipeline_queries(repo_root)
+    related_query_cases = load_related_term_queries(repo_root)
+    answer_cases = load_answer_queries(repo_root)
+    if not query_cases:
+        raise ValueError("No search queries with messageMatches found in the dataset")
+    model, vector_base = await build_vector_base(
+        model_spec,
+        message_texts,
+        batch_size,
+    )
+    related_vector_base = await build_text_vector_base(
+        model,
+        related_terms,
+        batch_size,
+    )
+    pipeline_conversation = await build_pipeline_conversation(repo_root, model)
+    query_embeddings = await model.get_embeddings_nocache(
+        [case.query for case in query_cases]
+    )
+    related_query_embeddings = await model.get_embeddings_nocache(
+        [case.term for case in related_query_cases]
+    )
+    answer_question_embeddings = await model.get_embeddings_nocache(
+        [case.question for case in answer_cases]
+    )
+    answer_embeddings = await model.get_embeddings_nocache(
+        [case.answer for case in answer_cases]
+    )
+    top_score_stats = measure_top_score_stats(vector_base, query_embeddings)
+    rows = await evaluate_grid(
+        vector_base,
+        query_cases,
+        query_embeddings,
+        min_scores,
+        max_hits_values,
+        pipeline_conversation,
+        pipeline_query_cases,
+        related_vector_base,
+        related_terms,
+        related_query_cases,
+        related_query_embeddings,
+        answer_cases,
+        answer_question_embeddings,
+        answer_embeddings,
+    )
+
+    print(f"Model: {model.model_name}")
+    print(f"Messages indexed: {len(message_texts)}")
+    print(f"Related terms indexed: {len(related_terms)}")
+    print(f"Queries evaluated: {len(query_cases)}")
+    print(f"Pipeline query cases evaluated: {len(pipeline_query_cases)}")
+    print(f"Related-term cases evaluated: {len(related_query_cases)}")
+    print(f"Answer cases evaluated: {len(answer_cases)}")
+    print("Pipeline scoring paths:")
+    for path in PIPELINE_SCORING_PATHS:
+        print(f"  {path}")
+    if corpus_metadata.serialized_total_embedding_count is not None:
+        print(
+            "Serialized sidecar rows ignored: "
+            f"{corpus_metadata.serialized_total_embedding_count} "
+            f"({INDEX_EMBEDDINGS_PATH.name})"
+        )
+    elif corpus_metadata.serialized_embedding_size is not None:
+        print(
+            "Serialized sidecar metadata found and ignored: "
+            f"embedding_size={corpus_metadata.serialized_embedding_size}"
+        )
+    print(f"Corpus embeddings: {CORPUS_EMBEDDING_SOURCE}")
+    print(f"Query embeddings: {QUERY_EMBEDDING_SOURCE}")
+    print(f"Answer embeddings: {ANSWER_EMBEDDING_SOURCE}")
+    print(
+        "Observed top-1 score range: "
+        f"{top_score_stats.min_top_score:.4f}..{top_score_stats.max_top_score:.4f} "
+        f"(mean {top_score_stats.mean_top_score:.4f})"
+    )
+    print()
+    print_rows(rows)
+
+    best_row = select_best_row(rows)
+    print()
+    print("Best-scoring benchmark row:")
+    print(f"  min_score={best_row.min_score:.2f}")
+    print(f"  max_hits={best_row.max_hits}")
+    print(f"  hit_rate={best_row.metrics.hit_rate:.2f}%")
+    print(f"  mrr={best_row.metrics.mean_reciprocal_rank:.4f}")
+    if best_row.pipeline_metrics:
+        print(f"  pipeline_hit_rate={best_row.pipeline_metrics.hit_rate:.2f}%")
+        print(f"  pipeline_mrr={best_row.pipeline_metrics.mean_reciprocal_rank:.4f}")
+        print(
+            "  pipeline_mean_result_count="
+            f"{best_row.pipeline_metrics.mean_result_count:.2f}"
+        )
+    if best_row.related_metrics:
+        print(f"  related_hit_rate={best_row.related_metrics.hit_rate:.2f}%")
+        print(f"  related_mrr={best_row.related_metrics.mean_reciprocal_rank:.4f}")
+        print(
+            "  related_mean_result_count="
+            f"{best_row.related_metrics.mean_result_count:.2f}"
+        )
+    if best_row.answer_metrics:
+        print(f"  answerable_support={best_row.answer_metrics.answerable_support:.4f}")
+        print(
+            "  no_answer_rejection_rate="
+            f"{best_row.answer_metrics.no_answer_rejection_rate:.2f}%"
+        )
+        print(f"  semantic_score={best_row.answer_metrics.semantic_score:.2f}")
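`run_benchmark` can also be driven without the CLI, which is handy from a notebook or another script. A minimal sketch under the same assumptions the CLI makes (credentials available via `.env`; the sweep values below are arbitrary examples, not recommendations):

```python
import asyncio

from benchmark_embeddings import run_benchmark

# Equivalent to something like:
#   --model openai:text-embedding-3-small --max-hits 10,25
asyncio.run(
    run_benchmark(
        model_spec="openai:text-embedding-3-small",
        min_scores=[0.60, 0.70, 0.80],
        max_hits_values=[10, 25],
        batch_size=16,
    )
)
```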
+
+
+def main() -> None:
+    """Parse CLI arguments and run the benchmark once."""
+
+    parser = argparse.ArgumentParser(
+        description="Benchmark retrieval settings for an embedding model."
+    )
+    parser.add_argument(
+        "--model",
+        type=str,
+        default=None,
+        help="Provider and model name, e.g. 'openai:text-embedding-3-small'",
+    )
+    parser.add_argument(
+        "--min-scores",
+        type=str,
+        default=None,
+        help="Comma-separated min_score values to test.",
+    )
+    parser.add_argument(
+        "--min-score-start",
+        type=float,
+        default=None,
+        help="Inclusive start of a generated min_score range.",
+    )
+    parser.add_argument(
+        "--min-score-stop",
+        type=float,
+        default=None,
+        help="Inclusive end of a generated min_score range.",
+    )
+    parser.add_argument(
+        "--min-score-step",
+        type=float,
+        default=None,
+        help="Step size for a generated min_score range.",
+    )
+    parser.add_argument(
+        "--max-hits",
+        type=str,
+        default=None,
+        help="Comma-separated max_hits values to test.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=16,
+        help="Batch size used when building the index.",
+    )
+    args = parser.parse_args()
+    if args.batch_size <= 0:
+        raise ValueError("--batch-size must be a positive integer")
+
+    asyncio.run(
+        run_benchmark(
+            model_spec=args.model,
+            min_scores=resolve_min_scores(
+                args.min_scores,
+                args.min_score_start,
+                args.min_score_stop,
+                args.min_score_step,
+            ),
+            max_hits_values=parse_int_list(args.max_hits),
+            batch_size=args.batch_size,
+        )
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tools/repeat_embedding_benchmarks.py b/tools/repeat_embedding_benchmarks.py
new file mode 100644
index 00000000..5673bca4
--- /dev/null
+++ b/tools/repeat_embedding_benchmarks.py
@@ -0,0 +1,598 @@
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Run embedding benchmarks repeatedly and save raw/summary JSON results.
+
+This script runs `tools/benchmark_embeddings.py` logic multiple times for each
+embedding model, stores every run as JSON, and writes aggregate summaries that
+can be used to justify tuned defaults.
+
+Usage:
+    uv run python tools/repeat_embedding_benchmarks.py
+    uv run python tools/repeat_embedding_benchmarks.py --runs 30
+    uv run python tools/repeat_embedding_benchmarks.py --models openai:text-embedding-3-small,openai:text-embedding-3-large,openai:text-embedding-ada-002
+    uv run python tools/repeat_embedding_benchmarks.py --models openai:text-embedding-3-small --min-score-start 0.01 --min-score-stop 0.20 --min-score-step 0.01
+"""
+
+import argparse
+import asyncio
+from dataclasses import asdict, dataclass
+from datetime import datetime, UTC
+import json
+from pathlib import Path
+from statistics import mean
+
+from benchmark_embeddings import (
+    ANSWER_EMBEDDING_SOURCE,
+    BenchmarkRow,
+    build_pipeline_conversation,
+    build_text_vector_base,
+    build_vector_base,
+    CORPUS_EMBEDDING_SOURCE,
+    DEFAULT_MAX_HITS,
+    evaluate_grid,
+    INDEX_DATA_PATH,
+    INDEX_EMBEDDINGS_PATH,
+    load_answer_queries,
+    load_corpus_metadata,
+    load_message_texts,
+    load_pipeline_queries,
+    load_related_term_queries,
+    load_related_term_texts,
+    load_search_queries,
+    measure_top_score_stats,
+    parse_int_list,
+    PIPELINE_SCORING_PATHS,
+    QUERY_EMBEDDING_SOURCE,
+    resolve_min_scores,
+    SEARCH_RESULTS_PATH,
+    select_best_row,
+)
+from dotenv import load_dotenv
+
+DEFAULT_MODELS = [
+    "openai:text-embedding-3-small",
+    "openai:text-embedding-3-large",
+    "openai:text-embedding-ada-002",
+]
+DEFAULT_OUTPUT_DIR = Path("benchmark_results")
+
+
+@dataclass
+class RunRow:
+    """Serialized benchmark row for one repeated run."""
+
+    min_score: float
+    max_hits: int
+    hit_rate: float
+    mean_reciprocal_rank: float
+    pipeline_hit_rate: float | None
+    pipeline_mean_reciprocal_rank: float | None
+    pipeline_mean_result_count: float | None
+    related_hit_rate: float | None
+    related_mean_reciprocal_rank: float | None
+    related_mean_result_count: float | None
+    answerable_support: float | None
+    no_answer_rejection_rate: float | None
+    semantic_score: float | None
+
+
+@dataclass
+class RunResult:
+    """All measurements captured for one benchmark repetition."""
+
+    run_index: int
+    model_spec: str
+    resolved_model_name: str
+    message_count: int
+    query_count: int
+    min_top_score: float
+    mean_top_score: float
+    max_top_score: float
+    rows: list[RunRow]
+    best_row: RunRow
+
+
+def sanitize_model_name(model_spec: str) -> str:
+    """Convert a model spec into a filesystem-safe directory name."""
+
+    return model_spec.replace(":", "__").replace("/", "_").replace("\\", "_")
+
+
+def benchmark_row_to_run_row(row: BenchmarkRow) -> RunRow:
+    """Flatten a benchmark row into the JSON-friendly repeated-run shape."""
+
+    return RunRow(
+        min_score=row.min_score,
+        max_hits=row.max_hits,
+        hit_rate=row.metrics.hit_rate,
+        mean_reciprocal_rank=row.metrics.mean_reciprocal_rank,
+        pipeline_hit_rate=(
+            row.pipeline_metrics.hit_rate if row.pipeline_metrics else None
+        ),
+        pipeline_mean_reciprocal_rank=(
+            row.pipeline_metrics.mean_reciprocal_rank if row.pipeline_metrics else None
+        ),
+        pipeline_mean_result_count=(
+            row.pipeline_metrics.mean_result_count if row.pipeline_metrics else None
+        ),
+        related_hit_rate=(
+            row.related_metrics.hit_rate if row.related_metrics else None
+        ),
+        related_mean_reciprocal_rank=(
+            row.related_metrics.mean_reciprocal_rank if row.related_metrics else None
+        ),
+        related_mean_result_count=(
+            row.related_metrics.mean_result_count if row.related_metrics else None
+        ),
+        answerable_support=(
+            row.answer_metrics.answerable_support if row.answer_metrics else None
+        ),
+        no_answer_rejection_rate=(
+            row.answer_metrics.no_answer_rejection_rate if row.answer_metrics else None
+        ),
+        semantic_score=(
+            row.answer_metrics.semantic_score if row.answer_metrics else None
+        ),
+    )
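Each run is persisted by converting the nested dataclasses to plain dicts with `dataclasses.asdict`, which recurses into `rows` and `best_row`. A small sketch of that round-trip with toy stand-in types and invented values:

```python
import json
from dataclasses import asdict, dataclass

@dataclass
class MiniRow:  # toy stand-in for RunRow
    min_score: float
    max_hits: int
    hit_rate: float

@dataclass
class MiniResult:  # toy stand-in for RunResult
    run_index: int
    rows: list[MiniRow]
    best_row: MiniRow

result = MiniResult(1, [MiniRow(0.7, 10, 80.0)], MiniRow(0.7, 10, 80.0))
payload = asdict(result)  # nested dataclasses become nested dicts
print(json.dumps(payload, indent=2))  # same shape as run_01.json
```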
+
+
+def summarize_runs(model_spec: str, runs: list[RunResult]) -> dict[str, object]:
+    """Average repeated benchmark runs into a per-model summary payload."""
+
+    summary_rows: dict[tuple[float, int], list[RunRow]] = {}
+    for run in runs:
+        for row in run.rows:
+            summary_rows.setdefault((row.min_score, row.max_hits), []).append(row)
+
+    averaged_rows: list[dict[str, float | int]] = []
+    for (min_score, max_hits), rows in sorted(summary_rows.items()):
+        averaged_rows.append(
+            {
+                "min_score": min_score,
+                "max_hits": max_hits,
+                "mean_hit_rate": mean(row.hit_rate for row in rows),
+                "mean_mrr": mean(row.mean_reciprocal_rank for row in rows),
+                "mean_pipeline_hit_rate": mean(
+                    row.pipeline_hit_rate or 0.0 for row in rows
+                ),
+                "mean_pipeline_mrr": mean(
+                    row.pipeline_mean_reciprocal_rank or 0.0 for row in rows
+                ),
+                "mean_pipeline_result_count": mean(
+                    row.pipeline_mean_result_count or 0.0 for row in rows
+                ),
+                "mean_related_hit_rate": mean(
+                    row.related_hit_rate or 0.0 for row in rows
+                ),
+                "mean_related_mrr": mean(
+                    row.related_mean_reciprocal_rank or 0.0 for row in rows
+                ),
+                "mean_related_result_count": mean(
+                    row.related_mean_result_count or 0.0 for row in rows
+                ),
+                "mean_answerable_support": mean(
+                    row.answerable_support or 0.0 for row in rows
+                ),
+                "mean_no_answer_rejection_rate": mean(
+                    row.no_answer_rejection_rate or 0.0 for row in rows
+                ),
+                "mean_semantic_score": mean(row.semantic_score or 0.0 for row in rows),
+            }
+        )
+
+    best_rows = [run.best_row for run in runs]
+    best_min_score_counts: dict[str, int] = {}
+    best_max_hits_counts: dict[str, int] = {}
+    for row in best_rows:
+        best_min_score_counts[f"{row.min_score:.2f}"] = (
+            best_min_score_counts.get(f"{row.min_score:.2f}", 0) + 1
+        )
+        best_max_hits_counts[str(row.max_hits)] = (
+            best_max_hits_counts.get(str(row.max_hits), 0) + 1
+        )
+
+    averaged_best_row = max(
+        averaged_rows,
+        key=lambda row: (
+            float(row["mean_pipeline_mrr"]),
+            float(row["mean_pipeline_hit_rate"]),
+            float(row["mean_related_mrr"]),
+            float(row["mean_related_hit_rate"]),
+            float(row["mean_mrr"]),
+            float(row["mean_hit_rate"]),
+            float(row["min_score"]),
+            float(row["mean_semantic_score"]),
+            -int(row["max_hits"]),
+        ),
+    )
+
+    return {
+        "model_spec": model_spec,
+        "resolved_model_name": runs[0].resolved_model_name,
+        "run_count": len(runs),
+        "message_count": runs[0].message_count,
+        "query_count": runs[0].query_count,
+        "min_top_score": mean(run.min_top_score for run in runs),
+        "mean_top_score": mean(run.mean_top_score for run in runs),
+        "max_top_score": mean(run.max_top_score for run in runs),
+        "candidate_rows": averaged_rows,
+        "recommended_row": averaged_best_row,
+        "best_min_score_counts": best_min_score_counts,
+        "best_max_hits_counts": best_max_hits_counts,
+    }
+
+
+def write_json(path: Path, data: object) -> None:
+    """Write a JSON artifact with stable indentation for review and reuse."""
+
+    path.write_text(json.dumps(data, indent=2), encoding="utf-8")
+
+
+def serialize_metadata_path(path: Path) -> str:
+    """Serialize repository-relative metadata paths consistently across OSes."""
+
+    return path.as_posix()
+
+
+def write_markdown_summary(path: Path, summaries: list[dict[str, object]]) -> None:
+    """Write the reviewer-facing markdown summary for all benchmarked models."""
+
+    lines = [
+        "# Repeated Embedding Benchmark Summary",
+        "",
+        "Corpus and query embeddings are recomputed for each evaluated model.",
+        (
+            "The serialized benchmark sidecar "
+            f"`{INDEX_EMBEDDINGS_PATH.name}` is ignored."
+        ),
+        "",
+        "| Model | Runs | Recommended min_score | Recommended max_hits | Pipeline hit rate | Pipeline MRR | Related hit rate | Related MRR | Mean hit rate | Mean MRR | Answer support | No-answer rejection | Semantic score |",
+        "| --- | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: | ---: |",
+    ]
+    for summary in summaries:
+        recommended_row = summary["recommended_row"]
+        assert isinstance(recommended_row, dict)
+        lines.append(
+            "| "
+            f"{summary['resolved_model_name']} | "
+            f"{summary['run_count']} | "
+            f"{recommended_row['min_score']:.2f} | "
+            f"{recommended_row['max_hits']} | "
+            f"{recommended_row['mean_pipeline_hit_rate']:.2f} | "
+            f"{recommended_row['mean_pipeline_mrr']:.4f} | "
+            f"{recommended_row['mean_related_hit_rate']:.2f} | "
+            f"{recommended_row['mean_related_mrr']:.4f} | "
+            f"{recommended_row['mean_hit_rate']:.2f} | "
+            f"{recommended_row['mean_mrr']:.4f} | "
+            f"{recommended_row['mean_answerable_support']:.4f} | "
+            f"{recommended_row['mean_no_answer_rejection_rate']:.2f} | "
+            f"{recommended_row['mean_semantic_score']:.2f} |"
+        )
+    lines.append("")
+    for summary in summaries:
+        lines.append(
+            f"- {summary['resolved_model_name']}: observed top-1 score range "
+            f"{summary['min_top_score']:.4f}..{summary['max_top_score']:.4f} "
+            f"(mean {summary['mean_top_score']:.4f})."
+        )
+    lines.append("")
+    path.write_text("\n".join(lines), encoding="utf-8")
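`summarize_runs` keys every observation by its `(min_score, max_hits)` cell before averaging, so rows from different runs line up regardless of ordering. The same pattern in miniature, with invented observations:

```python
from statistics import mean

# (min_score, max_hits, hit_rate) observations from several runs.
observations = [(0.7, 10, 80.0), (0.7, 10, 84.0), (0.8, 10, 60.0)]

# Group by grid cell, then average each cell's values.
cells: dict[tuple[float, int], list[float]] = {}
for min_score, max_hits, hit_rate in observations:
    cells.setdefault((min_score, max_hits), []).append(hit_rate)

averaged = {cell: mean(values) for cell, values in sorted(cells.items())}
print(averaged)  # {(0.7, 10): 82.0, (0.8, 10): 60.0}
```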
+
+
+def build_run_suite_metadata(
+    repo_root: Path,
+    timestamp: str,
+    models: list[str],
+    runs: int,
+    min_scores: list[float],
+    max_hits_values: list[int],
+    batch_size: int,
+) -> dict[str, object]:
+    """Build the shared metadata payload for one repeated benchmark suite."""
+
+    corpus_metadata = load_corpus_metadata(repo_root)
+    return {
+        "created_at_utc": timestamp,
+        "runs_per_model": runs,
+        "models": models,
+        "message_source_json": serialize_metadata_path(INDEX_DATA_PATH),
+        "pipeline_source_json": serialize_metadata_path(SEARCH_RESULTS_PATH),
+        "related_term_source_json": serialize_metadata_path(SEARCH_RESULTS_PATH),
+        "pipeline_scoring_paths": PIPELINE_SCORING_PATHS,
+        "ignored_serialized_embedding_sidecar": serialize_metadata_path(
+            INDEX_EMBEDDINGS_PATH
+        ),
+        "ignored_serialized_embedding_size": (
+            corpus_metadata.serialized_embedding_size
+        ),
+        "ignored_serialized_message_embedding_count": (
+            corpus_metadata.serialized_message_count
+        ),
+        "ignored_serialized_related_embedding_count": (
+            corpus_metadata.serialized_related_count
+        ),
+        "ignored_serialized_total_embedding_count": (
+            corpus_metadata.serialized_total_embedding_count
+        ),
+        "corpus_embedding_source": CORPUS_EMBEDDING_SOURCE,
+        "query_embedding_source": QUERY_EMBEDDING_SOURCE,
+        "answer_embedding_source": ANSWER_EMBEDDING_SOURCE,
+        "min_scores": min_scores,
+        "max_hits_values": max_hits_values,
+        "min_score_count": len(min_scores),
+        "max_hits_count": len(max_hits_values),
+        "grid_row_count": len(min_scores) * len(max_hits_values),
+        "batch_size": batch_size,
+    }
+
+
+async def run_single_model_benchmark(
+    model_spec: str,
+    runs: int,
+    min_scores: list[float],
+    max_hits_values: list[int],
+    batch_size: int,
+    output_dir: Path,
+) -> dict[str, object]:
+    """Run the benchmark repeatedly for one model and persist raw artifacts."""
+
+    repo_root = Path(__file__).resolve().parent.parent
+    message_texts = load_message_texts(repo_root)
+    related_terms = load_related_term_texts(repo_root)
+    query_cases = load_search_queries(repo_root)
+    pipeline_query_cases = load_pipeline_queries(repo_root)
+    related_query_cases = load_related_term_queries(repo_root)
+    answer_cases = load_answer_queries(repo_root)
+    model_output_dir = output_dir / sanitize_model_name(model_spec)
+    model_output_dir.mkdir(parents=True, exist_ok=True)
+    grid_row_count = len(min_scores) * len(max_hits_values)
+    print(
+        "  Loaded fixtures: "
+        f"{len(message_texts)} messages, "
+        f"{len(query_cases)} direct queries, "
+        f"{len(pipeline_query_cases)} pipeline queries, "
+        f"{len(related_query_cases)} related-term queries, "
+        f"{len(answer_cases)} answer queries.",
+        flush=True,
+    )
+    print(
+        f"  Preparing {model_spec} indexes and embeddings once "
+        f"for {grid_row_count} grid rows "
+        f"({len(min_scores)} min_score values x "
+        f"{len(max_hits_values)} max_hits values)...",
+        flush=True,
+    )
+    model, vector_base = await build_vector_base(
+        model_spec,
+        message_texts,
+        batch_size,
+    )
+    print("  Message vector index ready.", flush=True)
+    related_vector_base = await build_text_vector_base(
+        model,
+        related_terms,
+        batch_size,
+    )
+    print("  Related-term vector index ready.", flush=True)
+    pipeline_conversation = await build_pipeline_conversation(
+        repo_root,
+        model,
+    )
+    print("  True pipeline conversation indexes ready.", flush=True)
+    query_embeddings = await model.get_embeddings_nocache(
+        [case.query for case in query_cases]
+    )
+    related_query_embeddings = await model.get_embeddings_nocache(
+        [case.term for case in related_query_cases]
+    )
+    answer_question_embeddings = await model.get_embeddings_nocache(
+        [case.question for case in answer_cases]
+    )
+    answer_embeddings = await model.get_embeddings_nocache(
+        [case.answer for case in answer_cases]
+    )
+    top_score_stats = measure_top_score_stats(
+        vector_base,
+        query_embeddings,
+    )
+    print(
+        "  Direct query-to-message diagnostic ready "
+        f"(best-match score range {top_score_stats.min_top_score:.4f}.."
+        f"{top_score_stats.max_top_score:.4f}; not used to cap the sweep).",
+        flush=True,
+    )
+
+    run_results: list[RunResult] = []
+    for run_index in range(1, runs + 1):
+        print(
+            f"  Run {run_index}/{runs}: evaluating {grid_row_count} grid rows...",
+            flush=True,
+        )
+        benchmark_rows = await evaluate_grid(
+            vector_base,
+            query_cases,
+            query_embeddings,
+            min_scores,
+            max_hits_values,
+            pipeline_conversation,
+            pipeline_query_cases,
+            related_vector_base,
+            related_terms,
+            related_query_cases,
+            related_query_embeddings,
+            answer_cases,
+            answer_question_embeddings,
+            answer_embeddings,
+            progress_label=f"  Run {run_index}/{runs}",
+        )
+
+        best_row = select_best_row(benchmark_rows)
+        print(
+            "  Run "
+            f"{run_index}/{runs}: best min_score={best_row.min_score:.2f}, "
+            f"max_hits={best_row.max_hits}.",
+            flush=True,
+        )
+        run_result = RunResult(
+            run_index=run_index,
+            model_spec=model_spec,
+            resolved_model_name=model.model_name,
+            message_count=len(message_texts),
+            query_count=len(query_cases),
+            min_top_score=top_score_stats.min_top_score,
+            mean_top_score=top_score_stats.mean_top_score,
+            max_top_score=top_score_stats.max_top_score,
+            rows=[benchmark_row_to_run_row(row) for row in benchmark_rows],
+            best_row=benchmark_row_to_run_row(best_row),
+        )
+        run_results.append(run_result)
+        write_json(model_output_dir / f"run_{run_index:02d}.json", asdict(run_result))
+
+    summary = summarize_runs(model_spec, run_results)
+    write_json(model_output_dir / "summary.json", summary)
+    return summary
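Putting `sanitize_model_name` together with the UTC timestamp created in `run_repeated_benchmarks` below gives the on-disk layout of one suite. A sketch of the paths a single model's runs produce (the timestamp value is invented for illustration):

```python
from pathlib import Path

output_root = Path("benchmark_results")
timestamp = "20240101T120000Z"  # invented; real value is datetime.now(UTC)
model_dir = output_root / timestamp / "openai__text-embedding-3-small"

print(model_dir / "run_01.json")                # one raw RunResult per repetition
print(model_dir / "summary.json")               # per-model aggregate
print(output_root / timestamp / "summary.md")   # reviewer-facing table
# metadata.json and a combined summary.json also live at the suite root.
```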
+
+
+async def run_repeated_benchmarks(
+    models: list[str],
+    runs: int,
+    min_scores: list[float],
+    max_hits_values: list[int],
+    batch_size: int,
+    output_root: Path,
+) -> Path:
+    """Run the benchmark suite for each requested model and save the artifacts."""
+
+    timestamp = datetime.now(UTC).strftime("%Y%m%dT%H%M%SZ")
+    output_dir = output_root / timestamp
+    output_dir.mkdir(parents=True, exist_ok=True)
+    repo_root = Path(__file__).resolve().parent.parent
+    metadata = build_run_suite_metadata(
+        repo_root=repo_root,
+        timestamp=timestamp,
+        models=models,
+        runs=runs,
+        min_scores=min_scores,
+        max_hits_values=max_hits_values,
+        batch_size=batch_size,
+    )
+    write_json(output_dir / "metadata.json", metadata)
+
+    summaries: list[dict[str, object]] = []
+    for model_spec in models:
+        print(f"Running {runs} benchmark iterations for {model_spec}...")
+        summary = await run_single_model_benchmark(
+            model_spec=model_spec,
+            runs=runs,
+            min_scores=min_scores,
+            max_hits_values=max_hits_values,
+            batch_size=batch_size,
+            output_dir=output_dir,
+        )
+        summaries.append(summary)
+
+    write_json(output_dir / "summary.json", summaries)
+    write_markdown_summary(output_dir / "summary.md", summaries)
+    return output_dir
+
+
+def parse_models(raw: str | None) -> list[str]:
+    """Parse the model list or fall back to the built-in OpenAI benchmark set."""
+
+    if raw is None:
+        return DEFAULT_MODELS
+    models = [item.strip() for item in raw.split(",") if item.strip()]
+    if not models:
+        raise ValueError("--models must contain at least one model")
+    return models
+
+
+def main() -> None:
+    """Parse CLI arguments and run repeated embedding benchmarks."""
+
+    parser = argparse.ArgumentParser(
+        description="Run embedding benchmarks repeatedly and save JSON results."
+    )
+    parser.add_argument(
+        "--models",
+        type=str,
+        default=None,
+        help="Comma-separated model specs to benchmark.",
+    )
+    parser.add_argument(
+        "--runs",
+        type=int,
+        default=30,
+        help="Number of repeated runs per model.",
+    )
+    parser.add_argument(
+        "--min-scores",
+        type=str,
+        default=None,
+        help="Comma-separated min_score values to test.",
+    )
+    parser.add_argument(
+        "--min-score-start",
+        type=float,
+        default=None,
+        help="Inclusive start of a generated min_score range.",
+    )
+    parser.add_argument(
+        "--min-score-stop",
+        type=float,
+        default=None,
+        help="Inclusive end of a generated min_score range.",
+    )
+    parser.add_argument(
+        "--min-score-step",
+        type=float,
+        default=None,
+        help="Step size for a generated min_score range.",
+    )
+    parser.add_argument(
+        "--max-hits",
+        type=str,
+        default=",".join(str(value) for value in DEFAULT_MAX_HITS),
+        help="Comma-separated max_hits values to test.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=16,
+        help="Batch size used when building the index.",
+    )
+    parser.add_argument(
+        "--output-dir",
+        type=str,
+        default=str(DEFAULT_OUTPUT_DIR),
+        help="Directory where benchmark results will be written.",
+    )
+    args = parser.parse_args()
+
+    if args.runs <= 0:
+        raise ValueError("--runs must be a positive integer")
+    if args.batch_size <= 0:
+        raise ValueError("--batch-size must be a positive integer")
+
+    load_dotenv()
+    output_dir = asyncio.run(
+        run_repeated_benchmarks(
+            models=parse_models(args.models),
+            runs=args.runs,
+            min_scores=resolve_min_scores(
+                args.min_scores,
+                args.min_score_start,
+                args.min_score_stop,
+                args.min_score_step,
+            ),
+            max_hits_values=parse_int_list(args.max_hits),
+            batch_size=args.batch_size,
+            output_root=Path(args.output_dir),
+        )
+    )
+    print(f"Wrote benchmark results to {output_dir}")
+
+
+if __name__ == "__main__":
+    main()
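`resolve_min_scores` itself is defined earlier in `benchmark_embeddings.py` and is not part of this diff. Under the documented flag semantics (inclusive start and stop, fixed step), its range mode plausibly behaves like the following sketch; the real helper may differ in details:

```python
def expand_min_scores(start: float, stop: float, step: float) -> list[float]:
    # Hypothetical re-implementation of the inclusive range mode only;
    # the real resolve_min_scores also accepts a comma-separated list.
    values: list[float] = []
    current = start
    while current <= stop + 1e-9:  # tolerate float drift at the endpoint
        values.append(round(current, 2))
        current += step
    return values

print(expand_min_scores(0.01, 0.05, 0.01))  # [0.01, 0.02, 0.03, 0.04, 0.05]
```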