Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ asyncio_default_fixture_loop_scope = "function"
testpaths = ["tests"]

[tool.pyright]
extraPaths = ["src", "tools"]
reportUnusedVariable = true
reportUnusedImport = true
reportDuplicateImport = true
Expand Down
62 changes: 50 additions & 12 deletions src/typeagent/aitools/vectorbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,52 @@
)
from .model_adapters import create_embedding_model

DEFAULT_MIN_SCORE = 0.85

# Per-model default score cutoffs for the built-in OpenAI embedding models.
# The values come from the min_score sweep (0.00..1.00, exhaustive) run by
# `tools/repeat_embedding_benchmarks.py` against the Adrian Tchaikovsky
# Episode 53 search benchmark: corpus and query embeddings are recomputed for
# each model (the fixture's serialized embedding sidecar is ignored) and
# cosine similarity is normalized onto the public 0..1 min_score scale.
# These are repository defaults for known models, not universal truths:
# unknown models keep the long-standing DEFAULT_MIN_SCORE fallback of 0.85,
# and callers can always pass an explicit `min_score` for their own models or
# use cases. `max_matches` is deliberately left out of this table — the
# benchmark still reports a best `max_hits` row, but the library default
# remains `None` unless a caller opts into a specific limit.
MODEL_DEFAULT_MIN_SCORES: dict[str, float] = {
    "text-embedding-3-large": 0.74,
    "text-embedding-3-small": 0.73,
    "text-embedding-ada-002": 0.93,
}


def get_default_min_score(model_name: str) -> float:
    """Return the repository default score cutoff for a known model name.

    Unknown model names fall back to ``DEFAULT_MIN_SCORE`` (0.85).
    """
    try:
        return MODEL_DEFAULT_MIN_SCORES[model_name]
    except KeyError:
        return DEFAULT_MIN_SCORE


def cosine_to_score(cosine_similarity: np.ndarray) -> np.ndarray:
    """Map cosine similarity from the -1..1 range onto the public 0..1 score scale.

    Inputs outside -1..1 (e.g. from accumulated float error) are clamped so the
    result always lands in [0.0, 1.0].
    """
    # Same affine expression as before — (x + 1.0) / 2.0 — kept verbatim so the
    # float results are bit-identical; only the shape of the code differs.
    shifted = (cosine_similarity + 1.0) / 2.0
    return np.clip(shifted, 0.0, 1.0)


@dataclass
class ScoredInt:
    """Associate an integer ordinal with its similarity score."""

    # Ordinal of the matched item (e.g. a row index into a vector store).
    item: int
    # Similarity score; lookups in this module produce values on the
    # public 0..1 scale via cosine_to_score().
    score: float


@dataclass
class TextEmbeddingIndexSettings:
"""Runtime settings for embedding-backed fuzzy lookup."""

embedding_model: IEmbeddingModel
min_score: float # Between 0.0 and 1.0
max_matches: int | None # >= 1; None means no limit
Expand All @@ -34,10 +71,12 @@ def __init__(
max_matches: int | None = None,
batch_size: int | None = None,
):
self.min_score = min_score if min_score is not None else 0.85
self.embedding_model = embedding_model or create_embedding_model()
model_name = getattr(self.embedding_model, "model_name", "")
default_min_score = get_default_min_score(model_name)
self.min_score = min_score if min_score is not None else default_min_score
self.max_matches = max_matches if max_matches and max_matches >= 1 else None
self.batch_size = batch_size if batch_size and batch_size >= 1 else 8
self.embedding_model = embedding_model or create_embedding_model()


class VectorBase:
Expand Down Expand Up @@ -76,20 +115,19 @@ def __bool__(self) -> bool:
def add_embedding(
self, key: str | None, embedding: NormalizedEmbedding | list[float]
) -> None:
if isinstance(embedding, list):
embedding = np.array(embedding, dtype=np.float32)
embedding_array = np.asarray(embedding, dtype=np.float32)
if self._embedding_size == 0:
self._set_embedding_size(len(embedding))
self._set_embedding_size(len(embedding_array))
self._vectors.shape = (0, self._embedding_size)
if len(embedding) != self._embedding_size:
if len(embedding_array) != self._embedding_size:
raise ValueError(
f"Embedding size mismatch: expected {self._embedding_size}, "
f"got {len(embedding)}"
f"got {len(embedding_array)}"
)
embeddings = embedding.reshape(1, -1) # Make it 2D: 1xN
embeddings = embedding_array.reshape(1, -1) # Make it 2D: 1xN
self._vectors = np.append(self._vectors, embeddings, axis=0)
if key is not None:
self._model.add_embedding(key, embedding)
self._model.add_embedding(key, embedding_array)

def add_embeddings(
self, keys: None | list[str], embeddings: NormalizedEmbeddings
Expand Down Expand Up @@ -132,7 +170,7 @@ def fuzzy_lookup_embedding(
min_score = 0.0
if len(self._vectors) == 0:
return []
scores = np.dot(self._vectors, embedding)
scores = cosine_to_score(np.dot(self._vectors, embedding))
if predicate is None:
# Stay in numpy: filter by score, then top-k via argpartition.
indices = np.flatnonzero(scores >= min_score)
Expand Down Expand Up @@ -174,7 +212,7 @@ def fuzzy_lookup_embedding_in_subset(
return []
# Compute dot products only for the subset instead of all vectors.
subset = np.asarray(ordinals_of_subset)
scores = np.dot(self._vectors[subset], embedding)
scores = cosine_to_score(np.dot(self._vectors[subset], embedding))
indices = np.flatnonzero(scores >= min_score)
if len(indices) == 0:
return []
Expand Down Expand Up @@ -235,7 +273,7 @@ def deserialize(self, data: NormalizedEmbeddings | None) -> None:
return
if self._embedding_size == 0:
if data.ndim < 2 or data.shape[0] == 0:
# Empty data can't determine size; just clear.
# Empty data can't determine size; just clear.
self.clear()
return
self._set_embedding_size(data.shape[1])
Expand Down
7 changes: 5 additions & 2 deletions src/typeagent/knowpro/convsettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from ..aitools.vectorbase import TextEmbeddingIndexSettings
from .interfaces import IKnowledgeExtractor, IStorageProvider

# Default score cutoffs (public 0..1 min_score scale) for the conversation-level
# embedding indexes built below. Related-term lookup keeps the stricter
# long-standing 0.85 cutoff; message-text search uses a looser 0.7 cutoff.
DEFAULT_RELATED_TERM_MIN_SCORE = 0.85
DEFAULT_MESSAGE_TEXT_MIN_SCORE = 0.7


@dataclass
class MessageTextIndexSettings:
Expand Down Expand Up @@ -45,13 +48,13 @@ def __init__(
# All settings share the same model, so they share the embedding cache.
model = model or create_embedding_model()
self.embedding_model = model
min_score = 0.85
min_score = DEFAULT_RELATED_TERM_MIN_SCORE
self.related_term_index_settings = RelatedTermIndexSettings(
TextEmbeddingIndexSettings(model, min_score=min_score, max_matches=50)
)
self.thread_settings = TextEmbeddingIndexSettings(model, min_score=min_score)
self.message_text_index_settings = MessageTextIndexSettings(
TextEmbeddingIndexSettings(model, min_score=0.7)
TextEmbeddingIndexSettings(model, min_score=DEFAULT_MESSAGE_TEXT_MIN_SCORE)
)
self.semantic_ref_index_settings = SemanticRefIndexSettings(
batch_size=4, # Effectively max concurrency
Expand Down
Loading