Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ asyncio_default_fixture_loop_scope = "function"
testpaths = ["tests"]

[tool.pyright]
extraPaths = ["src", "tools"]
reportUnusedVariable = true
reportUnusedImport = true
reportDuplicateImport = true
Expand Down
62 changes: 50 additions & 12 deletions src/typeagent/aitools/vectorbase.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,15 +13,52 @@
)
from .model_adapters import create_embedding_model

DEFAULT_MIN_SCORE = 0.85

# Per-model default score cutoffs for the built-in OpenAI embedding models.
# The values come from the min_score sweep (0.00..1.00, exhaustive) run by
# `tools/repeat_embedding_benchmarks.py` against the Adrian Tchaikovsky
# Episode 53 search benchmark: corpus and query embeddings are recomputed for
# each model (the fixture's serialized embedding sidecar is ignored) and
# cosine similarity is normalized onto the public 0..1 min_score scale.
# These are repository defaults for known models, not universal truths:
# unknown models keep the long-standing DEFAULT_MIN_SCORE fallback of 0.85,
# and callers can always pass an explicit `min_score` for their own models or
# use cases. `max_matches` is deliberately left out of this table — the
# benchmark still reports a best `max_hits` row, but the library default
# remains `None` unless a caller opts into a specific limit.
MODEL_DEFAULT_MIN_SCORES: dict[str, float] = {
    "text-embedding-3-large": 0.74,
    "text-embedding-3-small": 0.73,
    "text-embedding-ada-002": 0.93,
}


def get_default_min_score(model_name: str) -> float:
    """Return the repository default score cutoff for a known model name.

    Unknown model names fall back to ``DEFAULT_MIN_SCORE`` (0.85).
    """
    try:
        return MODEL_DEFAULT_MIN_SCORES[model_name]
    except KeyError:
        return DEFAULT_MIN_SCORE


def cosine_to_score(cosine_similarity: np.ndarray) -> np.ndarray:
    """Map cosine similarity from the -1..1 range onto the public 0..1 score scale.

    Inputs outside -1..1 (e.g. from accumulated float error) are clamped so the
    result always lands in [0.0, 1.0].
    """
    # Same affine expression as before — (x + 1.0) / 2.0 — kept verbatim so the
    # float results are bit-identical; only the shape of the code differs.
    shifted = (cosine_similarity + 1.0) / 2.0
    return np.clip(shifted, 0.0, 1.0)


@dataclass
class ScoredInt:
    """Associate an integer ordinal with its similarity score."""

    # Ordinal of the matched item (e.g. a row index into a vector store).
    item: int
    # Similarity score; lookups in this module produce values on the
    # public 0..1 scale via cosine_to_score().
    score: float


@dataclass
class TextEmbeddingIndexSettings:
"""Runtime settings for embedding-backed fuzzy lookup."""

embedding_model: IEmbeddingModel
min_score: float # Between 0.0 and 1.0
max_matches: int | None # >= 1; None means no limit
Expand All @@ -34,10 +71,12 @@ def __init__(
max_matches: int | None = None,
batch_size: int | None = None,
):
self.min_score = min_score if min_score is not None else 0.85
self.embedding_model = embedding_model or create_embedding_model()
model_name = getattr(self.embedding_model, "model_name", "")
default_min_score = get_default_min_score(model_name)
self.min_score = min_score if min_score is not None else default_min_score
self.max_matches = max_matches if max_matches and max_matches >= 1 else None
self.batch_size = batch_size if batch_size and batch_size >= 1 else 8
self.embedding_model = embedding_model or create_embedding_model()


class VectorBase:
Expand Down Expand Up @@ -76,20 +115,19 @@ def __bool__(self) -> bool:
def add_embedding(
self, key: str | None, embedding: NormalizedEmbedding | list[float]
) -> None:
if isinstance(embedding, list):
embedding = np.array(embedding, dtype=np.float32)
embedding_array = np.asarray(embedding, dtype=np.float32)
if self._embedding_size == 0:
self._set_embedding_size(len(embedding))
self._set_embedding_size(len(embedding_array))
self._vectors.shape = (0, self._embedding_size)
if len(embedding) != self._embedding_size:
if len(embedding_array) != self._embedding_size:
raise ValueError(
f"Embedding size mismatch: expected {self._embedding_size}, "
f"got {len(embedding)}"
f"got {len(embedding_array)}"
)
embeddings = embedding.reshape(1, -1) # Make it 2D: 1xN
embeddings = embedding_array.reshape(1, -1) # Make it 2D: 1xN
self._vectors = np.append(self._vectors, embeddings, axis=0)
if key is not None:
self._model.add_embedding(key, embedding)
self._model.add_embedding(key, embedding_array)

def add_embeddings(
self, keys: None | list[str], embeddings: NormalizedEmbeddings
Expand Down Expand Up @@ -132,7 +170,7 @@ def fuzzy_lookup_embedding(
min_score = 0.0
if len(self._vectors) == 0:
return []
scores = np.dot(self._vectors, embedding)
scores = cosine_to_score(np.dot(self._vectors, embedding))
if predicate is None:
# Stay in numpy: filter by score, then top-k via argpartition.
indices = np.flatnonzero(scores >= min_score)
Expand Down Expand Up @@ -174,7 +212,7 @@ def fuzzy_lookup_embedding_in_subset(
return []
# Compute dot products only for the subset instead of all vectors.
subset = np.asarray(ordinals_of_subset)
scores = np.dot(self._vectors[subset], embedding)
scores = cosine_to_score(np.dot(self._vectors[subset], embedding))
indices = np.flatnonzero(scores >= min_score)
if len(indices) == 0:
return []
Expand Down Expand Up @@ -235,7 +273,7 @@ def deserialize(self, data: NormalizedEmbeddings | None) -> None:
return
if self._embedding_size == 0:
if data.ndim < 2 or data.shape[0] == 0:
# Empty data can't determine size; just clear.
# Empty data can't determine size; just clear.
self.clear()
return
self._set_embedding_size(data.shape[1])
Expand Down
7 changes: 5 additions & 2 deletions src/typeagent/knowpro/convsettings.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@
from ..aitools.vectorbase import TextEmbeddingIndexSettings
from .interfaces import IKnowledgeExtractor, IStorageProvider

# Default score cutoffs (public 0..1 min_score scale) for the conversation-level
# embedding indexes built below. Related-term lookup keeps the stricter
# long-standing 0.85 cutoff; message-text search uses a looser 0.7 cutoff.
DEFAULT_RELATED_TERM_MIN_SCORE = 0.85
DEFAULT_MESSAGE_TEXT_MIN_SCORE = 0.7


@dataclass
class MessageTextIndexSettings:
Expand Down Expand Up @@ -45,13 +48,13 @@ def __init__(
# All settings share the same model, so they share the embedding cache.
model = model or create_embedding_model()
self.embedding_model = model
min_score = 0.85
min_score = DEFAULT_RELATED_TERM_MIN_SCORE
self.related_term_index_settings = RelatedTermIndexSettings(
TextEmbeddingIndexSettings(model, min_score=min_score, max_matches=50)
)
self.thread_settings = TextEmbeddingIndexSettings(model, min_score=min_score)
self.message_text_index_settings = MessageTextIndexSettings(
TextEmbeddingIndexSettings(model, min_score=0.7)
TextEmbeddingIndexSettings(model, min_score=DEFAULT_MESSAGE_TEXT_MIN_SCORE)
)
self.semantic_ref_index_settings = SemanticRefIndexSettings(
batch_size=4, # Effectively max concurrency
Expand Down
Loading