diff --git a/Cargo.lock b/Cargo.lock
index e5bc60d..9e393c3 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1065,6 +1065,7 @@
 name = "index"
 version = "0.1.0"
 dependencies = [
  "defs",
+ "rand",
  "uuid",
 ]
diff --git a/crates/api/src/lib.rs b/crates/api/src/lib.rs
index f190682..db83005 100644
--- a/crates/api/src/lib.rs
+++ b/crates/api/src/lib.rs
@@ -1,4 +1,4 @@
-use defs::{DbError, IndexedVector, Similarity};
+use defs::{DbError, Dimension, IndexedVector, Similarity};
 use defs::{DenseVector, Payload, Point, PointId};
 use std::path::PathBuf;
@@ -135,7 +135,7 @@ pub struct DbConfig {
     pub storage_type: StorageType,
     pub index_type: IndexType,
     pub data_path: PathBuf,
-    pub dimension: usize,
+    pub dimension: Dimension,
 }
 
 pub fn init_api(config: DbConfig) -> Result {
diff --git a/crates/defs/src/error.rs b/crates/defs/src/error.rs
index 3cbdac9..8da2d4a 100644
--- a/crates/defs/src/error.rs
+++ b/crates/defs/src/error.rs
@@ -1,4 +1,6 @@
 use std::io;
+
+use crate::Dimension;
 #[derive(Debug, PartialEq, Eq)]
 pub enum DbError {
     ParseError,
@@ -8,6 +10,7 @@ pub enum DbError {
     IndexError(String),
     LockError,
     DimensionMismatch,
+    InvalidDimension { expected: Dimension, got: Dimension },
 }
 
 #[derive(Debug)]
diff --git a/crates/defs/src/types.rs b/crates/defs/src/types.rs
index ae69f17..7a6430f 100644
--- a/crates/defs/src/types.rs
+++ b/crates/defs/src/types.rs
@@ -9,6 +9,8 @@ pub type Element = f32;
 // pub type ElementHalf = f16; - Unstable https://github.com/rust-lang/rust/issues/116909
 pub type ElementByte = u8;
 
+pub type Dimension = usize;
+
 // Dense Vector and Vector are considered same
 // Sparse vector implementation not supported yet. Refer lib/sparse/src/common/sparse_vector.rs
 pub type DenseVector = Vec<Element>;
@@ -89,3 +91,29 @@ impl<'q> Eq for DistanceOrderedVector<'q> {}
 //     Discovery(DiscoveryQuery),
 //     Context(ContextQuery),
 // }
+
+#[derive(Clone, Copy, Debug, PartialEq)]
+pub struct OrdF32(f32);
+
+impl OrdF32 {
+    pub fn new(x: f32) -> Self {
+        Self(x)
+    }
+    pub fn into_inner(self) -> f32 {
+        self.0
+    }
+}
+
+impl Eq for OrdF32 {}
+
+impl Ord for OrdF32 {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.0.total_cmp(&other.0)
+    }
+}
+
+impl PartialOrd for OrdF32 {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
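
`OrdF32` exists because raw `f32` is only `PartialOrd`; `total_cmp` supplies the total order that the binary heaps in `hnsw/search.rs` below rely on. A minimal sketch of the intended use (values are illustrative; assumes the `defs` crate is in the dependency tree):

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

use defs::OrdF32;

fn main() {
    // BinaryHeap is a max-heap; Reverse flips it into the min-heap
    // that the candidate queue in hnsw/search.rs uses.
    let mut candidates: BinaryHeap<Reverse<OrdF32>> = BinaryHeap::new();
    for d in [0.9_f32, 0.1, 0.5] {
        candidates.push(Reverse(OrdF32::new(d)));
    }
    // The smallest distance surfaces first.
    assert_eq!(candidates.pop().unwrap().0.into_inner(), 0.1);

    // total_cmp places NaN after +inf, so a NaN distance cannot
    // break the heap's ordering invariants.
    assert!(OrdF32::new(f32::NAN) > OrdF32::new(f32::INFINITY));
}
```
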
diff --git a/crates/index/Cargo.toml b/crates/index/Cargo.toml
index 7643ba3..dc33e81 100644
--- a/crates/index/Cargo.toml
+++ b/crates/index/Cargo.toml
@@ -8,4 +8,5 @@ edition = "2021"
 
 [dependencies]
 defs = { path = "../defs" }
-uuid.workspace = true
\ No newline at end of file
+rand = "0.9.2"
+uuid.workspace = true
diff --git a/crates/index/src/hnsw.rs b/crates/index/src/hnsw.rs
deleted file mode 100644
index a996e3a..0000000
--- a/crates/index/src/hnsw.rs
+++ /dev/null
@@ -1,29 +0,0 @@
-use crate::VectorIndex;
-
-pub struct HnswIndex {
-
-}
-
-impl HnswIndex {
-    pub fn new() -> Self {
-        Self {}
-    }
-}
-
-impl VectorIndex for HnswIndex {
-    fn insert(&self, vector: &[f32]) -> Result<(), Error> {
-        Ok(())
-    }
-
-    fn delete(&self, vector: &[f32]) -> Result<(), Error> {
-        Ok(())
-    }
-
-    fn search(&self, vector: &[f32]) -> Result<Vec<f32>, Error> {
-        Ok(vec![])
-    }
-
-    fn build(&self) -> Result<(), Error> {
-        Ok(())
-    }
-}
\ No newline at end of file
diff --git a/crates/index/src/hnsw/index.rs b/crates/index/src/hnsw/index.rs
new file mode 100644
index 0000000..7f9caf3
--- /dev/null
+++ b/crates/index/src/hnsw/index.rs
@@ -0,0 +1,300 @@
+use std::cmp::{max, min};
+use std::collections::HashMap;
+
+use defs::{DbError, DenseVector, Dimension, IndexedVector, PointId, Similarity};
+use uuid::Uuid;
+
+use crate::VectorIndex;
+
+use super::types::{HnswStats, LevelGenerator, Node, PointIndexation};
+
+pub struct HnswIndex {
+    // Construction/search parameters
+    pub ef_construction: usize,
+    // Layered point storage and entry point
+    pub index: PointIndexation,
+    // Cached dimension of stored vectors
+    pub data_dimension: Dimension,
+    // Guard against concurrent mutation during queries
+    pub searching: bool,
+    // Default query beam width (ef); recommended ef ≥ k at query time
+    pub ef: usize,
+    // In-memory vector cache owned by the index
+    cache: HashMap<PointId, DenseVector>,
+    // Fixed metric for this index; used consistently in insert and search
+    pub similarity: Similarity,
+}
+
+impl HnswIndex {
+    pub fn new(similarity: Similarity, data_dimension: Dimension) -> Self {
+        let max_connections = 16;
+        let max_connections_0 = 32; // M0 = 2 * M (common default)
+        let max_layer = 16;
+        let ef_construction = 200;
+        let ef = 100;
+
+        let level_generator = LevelGenerator::from_m(max_connections);
+        let index = PointIndexation {
+            max_connections,
+            max_connections_0,
+            max_layer,
+            points_by_layer: vec![Vec::new(); max_layer],
+            nodes: HashMap::new(),
+            nb_point: 0,
+            entry_point: None,
+            level_generator,
+        };
+
+        Self {
+            ef_construction,
+            index,
+            data_dimension,
+            searching: false,
+            ef,
+            cache: HashMap::new(),
+            similarity,
+        }
+    }
+
+    /// Returns a slice of the stored vector for the given PointId.
+    /// TODO: integrate this cache with an in-memory store backed by RocksDB; on cache miss,
+    /// fetch from storage, populate the cache, and return a stable slice.
+    pub(super) fn get_vec(&self, id: PointId) -> &[f32] {
+        let v = self
+            .cache
+            .get(&id)
+            .unwrap_or_else(|| panic!("Vector not found in HNSW cache for id={id}"));
+        v.as_slice()
+    }
+}
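
For orientation, a construction sketch using the defaults baked into `new()` (M = 16, M0 = 32, ef_construction = 200, ef = 100). `ef` is a public field, so a caller can widen the query beam per index; the 256 below is an illustrative value, not a recommendation from this PR:

```rust
use defs::Similarity;
use index::hnsw::HnswIndex;

fn main() {
    // 128-dimensional index with the fixed cosine metric.
    let mut idx = HnswIndex::new(Similarity::Cosine, 128);
    assert_eq!(idx.ef_construction, 200);

    // Trade latency for recall; keep ef >= k as the field comment advises.
    idx.ef = 256;
}
```
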
+
+impl VectorIndex for HnswIndex {
+    /// Insert a new point
+    /// - sample a random level for the new node
+    /// - if empty, set entry point to the new id and return
+    /// - greedy descend from current entry to l+1 to get a pivot
+    /// - for each level down to 0: ef-construction, diversity pruning, bidirectional connect with caps
+    /// - if l is above current max level, update the entry point
+    fn insert(&mut self, vector: IndexedVector) -> Result<(), DbError> {
+        let dim = vector.vector.len();
+        if dim != self.data_dimension {
+            return Err(DbError::InvalidDimension {
+                expected: self.data_dimension,
+                got: dim,
+            });
+        }
+
+        let new_id: PointId = vector.id;
+
+        let mut query_vec = vector.vector.clone();
+        self.normalize_if_cosine(&mut query_vec);
+
+        self.cache.insert(new_id, query_vec.clone());
+
+        let mut rng = rand::rng();
+        let l: u8 = self
+            .index
+            .level_generator
+            .sample_level(&mut rng, self.index.max_layer);
+
+        let node = Node {
+            id: new_id,
+            level: l,
+            neighbors: vec![vec![]; (l as usize) + 1],
+            deleted: false,
+        };
+
+        let needed_layers = (l as usize) + 1;
+        if self.index.points_by_layer.len() < needed_layers {
+            self.index.points_by_layer.resize(needed_layers, Vec::new());
+        }
+        self.index.nodes.insert(new_id, node);
+        // Keep the insertion counter in sync with the node map.
+        self.index.nb_point += 1;
+        for layer in 0..=l as usize {
+            self.index.points_by_layer[layer].push(new_id);
+        }
+
+        if self.index.entry_point.is_none() {
+            self.index.entry_point = Some(new_id);
+            return Ok(());
+        }
+
+        let mut ep = self.index.entry_point.unwrap();
+        let current_max_level = self
+            .index
+            .nodes
+            .get(&ep)
+            .map(|n| n.level as usize)
+            .unwrap_or(0);
+        if current_max_level > (l as usize) {
+            for level in ((l as usize + 1)..=current_max_level).rev() {
+                ep = self.greedy_search_layer(ep, level, &query_vec);
+            }
+        }
+
+        for level in (0..=min(l as usize, current_max_level)).rev() {
+            let w = self.search_layer_for_insert(ep, level, &query_vec, self.ef_construction);
+
+            let m_level = if level == 0 {
+                self.index.max_connections_0
+            } else {
+                self.index.max_connections
+            };
+            let chosen = self.select_neighbors_heuristic(&w, m_level);
+            self.connect_bidirectional(new_id, &chosen, level, m_level);
+            if let Some((closest_id, _)) = w.first() {
+                ep = *closest_id;
+            }
+        }
+        if (l as usize) > current_max_level {
+            self.index.entry_point = Some(new_id);
+        }
+
+        Ok(())
+    }
+
+    /// Delete a point (soft)
+    /// - mark node as deleted and clear its cached vector
+    /// - traversals skip deleted nodes
+    /// - if entry point was deleted, move it to the highest-level non-deleted node (or None)
+    fn delete(&mut self, point_id: PointId) -> Result<bool, DbError> {
+        if let Some(node) = self.index.nodes.get_mut(&point_id) {
+            if node.deleted {
+                return Ok(false);
+            }
+            node.deleted = true;
+            self.cache.remove(&point_id);
+            if self.index.entry_point == Some(point_id) {
+                self.index.entry_point = self.pick_entry();
+            }
+            return Ok(true);
+        }
+        Ok(false)
+    }
+
+    /// Search for top-k ids
+    /// - normalize query for cosine (1 − cos)
+    /// - pick a non-deleted entry point
+    /// - greedy descend from the top layer to level 1
+    /// - run ef-best-first at level 0 with ef0 = max(ef, k)
+    /// - return up to k ids by ascending distance
+    fn search(
+        &self,
+        mut query: DenseVector,
+        _similarity: Similarity,
+        k: usize,
+    ) -> Result<Vec<PointId>, DbError> {
+        if k == 0 {
+            return Ok(Vec::new());
+        }
+
+        if query.len() != self.data_dimension {
+            return Err(DbError::InvalidDimension {
+                expected: self.data_dimension,
+                got: query.len(),
+            });
+        }
+
+        let entry = match self.pick_entry() {
+            Some(id) => id,
+            None => return Ok(Vec::new()),
+        };
+
+        self.normalize_if_cosine(&mut query);
+
+        let mut ep = entry;
+        let current_max_level = self
+            .index
+            .nodes
+            .get(&entry)
+            .map(|n| n.level as usize)
+            .unwrap_or(0);
+        if current_max_level > 0 {
+            for level in (1..=current_max_level).rev() {
+                ep = self.greedy_search_layer(ep, level, &query);
+            }
+        }
+        let ef0 = max(self.ef, k);
+        let mut w = self.search_layer_for_insert(ep, 0, &query, ef0);
+        w.truncate(k);
+        let result: Vec<PointId> = w.into_iter().map(|(id, _)| id).collect();
+        Ok(result)
+    }
+}
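
A quick end-to-end pass over the three trait methods (ids and vectors are illustrative; `PointId` is the `Uuid` alias the tests below also rely on):

```rust
use defs::{DbError, IndexedVector, Similarity};
use index::hnsw::HnswIndex;
use index::VectorIndex;
use uuid::Uuid;

fn main() -> Result<(), DbError> {
    let mut idx = HnswIndex::new(Similarity::Euclidean, 2);
    let (a, b) = (Uuid::from_u128(1), Uuid::from_u128(2));
    idx.insert(IndexedVector { id: a, vector: vec![0.0, 0.0] })?;
    idx.insert(IndexedVector { id: b, vector: vec![1.0, 0.0] })?;

    // Dimension is validated up front: a 3-d query against a 2-d
    // index is InvalidDimension, not a panic.
    assert!(idx.search(vec![0.0; 3], Similarity::Euclidean, 1).is_err());

    // Nearest neighbor of (0.9, 0.0) is b; after a soft delete the
    // tombstone is skipped and a comes back instead.
    assert_eq!(idx.search(vec![0.9, 0.0], Similarity::Euclidean, 1)?, vec![b]);
    idx.delete(b)?;
    assert_eq!(idx.search(vec![0.9, 0.0], Similarity::Euclidean, 1)?, vec![a]);
    Ok(())
}
```
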
+
+impl HnswIndex {
+    /// Full rebuild from surviving (non-deleted) vectors currently in-memory.
+    /// Gathers all non-deleted vectors from the cache, clears the graph, and reinserts.
+    pub fn rebuild_full(&mut self) -> Result<(), DbError> {
+        let ids: Vec<PointId> = self
+            .index
+            .nodes
+            .iter()
+            .filter(|(_, n)| !n.deleted)
+            .map(|(id, _)| *id)
+            .collect();
+        let mut points: Vec<IndexedVector> = Vec::with_capacity(ids.len());
+        for id in ids {
+            if let Some(vec) = self.cache.get(&id) {
+                points.push(IndexedVector {
+                    id,
+                    vector: vec.clone(),
+                });
+            }
+        }
+        self.index.nodes.clear();
+        for layer in &mut self.index.points_by_layer {
+            layer.clear();
+        }
+        self.index.nb_point = 0;
+        self.index.entry_point = None;
+        self.cache.clear();
+        for iv in points {
+            self.insert(iv)?;
+        }
+        Ok(())
+    }
+
+    /// Fraction of deleted nodes among all nodes (0.0 if no nodes)
+    pub fn deleted_ratio(&self) -> f32 {
+        let total = self.index.nodes.len();
+        if total == 0 {
+            return 0.0;
+        }
+        let deleted = self.index.nodes.values().filter(|n| n.deleted).count();
+        deleted as f32 / total as f32
+    }
+
+    /// Alive/deleted counts and a level histogram for alive nodes
+    pub fn stats(&self) -> HnswStats {
+        let mut alive = 0usize;
+        let mut deleted = 0usize;
+        let mut hist: std::collections::BTreeMap<u8, usize> = std::collections::BTreeMap::new();
+        for n in self.index.nodes.values() {
+            if n.deleted {
+                deleted += 1;
+            } else {
+                alive += 1;
+                *hist.entry(n.level).or_insert(0) += 1;
+            }
+        }
+        HnswStats {
+            alive,
+            deleted,
+            level_histogram: hist,
+        }
+    }
+
+    /// Normalize vector in-place if cosine similarity is used
+    fn normalize_if_cosine(&self, v: &mut [f32]) {
+        if let Similarity::Cosine = self.similarity {
+            let sum_sq: f32 = v.iter().map(|&x| x * x).sum();
+            let norm = sum_sq.sqrt();
+            if norm > 0.0 {
+                for x in v {
+                    *x /= norm;
+                }
+            }
+        }
+    }
+}
diff --git a/crates/index/src/hnsw/mod.rs b/crates/index/src/hnsw/mod.rs
new file mode 100644
index 0000000..dfbc7ae
--- /dev/null
+++ b/crates/index/src/hnsw/mod.rs
@@ -0,0 +1,11 @@
+// Referenced from HNSW (Malkov & Yashunin, 2018)
+// https://arxiv.org/abs/1603.09320
+
+pub mod index;
+pub mod search;
+pub mod types;
+
+pub use index::HnswIndex;
+
+#[cfg(test)]
+mod tests;
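
Because `delete` only tombstones nodes, graph quality decays as `deleted_ratio` grows, and `rebuild_full` is the recovery path. A hypothetical maintenance hook (the 0.3 threshold is an assumption, not something this PR defines):

```rust
use defs::DbError;
use index::hnsw::HnswIndex;

// Rebuild once ~30% of nodes are tombstones. rebuild_full reinserts
// every surviving vector, so it costs O(N) inserts and blocks writers.
fn maybe_compact(idx: &mut HnswIndex) -> Result<(), DbError> {
    if idx.deleted_ratio() > 0.3 {
        idx.rebuild_full()?;
    }
    Ok(())
}
```
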
diff --git a/crates/index/src/hnsw/search.rs b/crates/index/src/hnsw/search.rs
new file mode 100644
index 0000000..a3cde8b
--- /dev/null
+++ b/crates/index/src/hnsw/search.rs
@@ -0,0 +1,281 @@
+use std::cmp::Reverse;
+use std::collections::BinaryHeap;
+use std::collections::HashSet;
+
+use defs::{OrdF32, PointId};
+
+use crate::distance;
+
+use super::index::HnswIndex;
+
+impl HnswIndex {
+    /// Greedy search within a fixed layer
+    /// - start from `ep` and evaluate neighbors at `level`
+    /// - move to a neighbor only if it strictly improves distance
+    /// - stop when no improvement and return the last id
+    pub(super) fn greedy_search_layer(&self, ep: PointId, level: usize, query: &[f32]) -> PointId {
+        let mut current = ep;
+        loop {
+            let cur_vec = self.get_vec(current);
+            let mut best_score = distance(query.to_vec(), cur_vec.to_vec(), self.similarity);
+            let mut best_id = current;
+
+            let empty: &[PointId] = &[];
+            let neighbors = self
+                .index
+                .nodes
+                .get(&current)
+                .and_then(|n| n.neighbors.get(level))
+                .map(|v| v.as_slice())
+                .unwrap_or(empty);
+
+            for &n in neighbors {
+                if n == current {
+                    continue;
+                }
+                // Skip deleted neighbors
+                if let Some(nn) = self.index.nodes.get(&n) {
+                    if nn.deleted {
+                        continue;
+                    }
+                }
+                let n_vec = self.get_vec(n);
+                let score = distance(query.to_vec(), n_vec.to_vec(), self.similarity);
+                if score < best_score {
+                    best_score = score;
+                    best_id = n;
+                }
+            }
+
+            if best_id == current {
+                break;
+            }
+            current = best_id;
+        }
+        current
+    }
+
+    /// Best-first (ef) search used during insertion on a given layer
+    /// - maintain candidate queue and working set up to `ef_construction`
+    /// - expand the closest candidate; skip deleted nodes
+    /// - early-exit if the best candidate is worse than the worst in W when full
+    /// - return W as (id, distance) sorted by ascending distance
+    pub(super) fn search_layer_for_insert(
+        &self,
+        ep: PointId,
+        level: usize,
+        query: &[f32],
+        ef_construction: usize,
+    ) -> Vec<(PointId, f32)> {
+        let mut visited: HashSet<PointId> = HashSet::new();
+
+        let mut candidates: BinaryHeap<(Reverse<OrdF32>, PointId)> = BinaryHeap::new();
+        let mut w_heap: BinaryHeap<(OrdF32, PointId)> = BinaryHeap::new();
+
+        // Seed with a non-deleted entry point
+        let seed = match self.index.nodes.get(&ep) {
+            Some(n) if !n.deleted && (n.level as usize) >= level => ep,
+            _ => self
+                .index
+                .nodes
+                .iter()
+                .filter(|(_, n)| !n.deleted && (n.level as usize) >= level)
+                .max_by(|a, b| a.1.level.cmp(&b.1.level).then_with(|| a.0.cmp(b.0)))
+                .map(|(id, _)| *id)
+                .unwrap_or(ep),
+        };
+
+        let ep_score = distance(query.to_vec(), self.get_vec(seed).to_vec(), self.similarity);
+        candidates.push((Reverse(OrdF32::new(ep_score)), seed));
+        w_heap.push((OrdF32::new(ep_score), seed));
+        visited.insert(seed);
+
+        while let Some((Reverse(best_d), current)) = candidates.pop() {
+            if w_heap.len() >= ef_construction {
+                if let Some(&(worst_d, _)) = w_heap.peek() {
+                    if best_d > worst_d {
+                        break;
+                    }
+                }
+            }
+
+            let empty: &[PointId] = &[];
+            let neighbors = self
+                .index
+                .nodes
+                .get(&current)
+                .and_then(|n| n.neighbors.get(level))
+                .map(|v| v.as_slice())
+                .unwrap_or(empty);
+
+            for &n in neighbors {
+                if visited.contains(&n) {
+                    continue;
+                }
+                // Skip deleted neighbors
+                if let Some(nn) = self.index.nodes.get(&n) {
+                    if nn.deleted {
+                        continue;
+                    }
+                }
+
+                visited.insert(n);
+                let score = distance(query.to_vec(), self.get_vec(n).to_vec(), self.similarity);
+                let score = OrdF32::new(score);
+                candidates.push((Reverse(score), n));
+                if w_heap.len() < ef_construction {
+                    w_heap.push((score, n));
+                } else if let Some(&(worst_d, _)) = w_heap.peek() {
+                    if score < worst_d {
+                        w_heap.pop();
+                        w_heap.push((score, n));
+                    }
+                }
+            }
+        }
+
+        let mut w: Vec<(PointId, f32)> = w_heap
+            .into_iter()
+            .map(|(d, id)| (id, d.into_inner()))
+            .collect();
+        w.sort_unstable_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+        w
+    }
+
+    /// Diversity-based neighbor selection (heuristic pruning)
+    /// - sort candidates by distance-to-new ascending
+    /// - accept a candidate unless it is dominated by an accepted one
+    /// - return up to `m` ids
+    pub(super) fn select_neighbors_heuristic(
+        &self,
+        candidates: &[(PointId, f32)],
+        m: usize,
+    ) -> Vec<PointId> {
+        if candidates.is_empty() || m == 0 {
+            return Vec::new();
+        }
+        let mut sorted = candidates.to_vec();
+        sorted.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+
+        let mut result: Vec<PointId> = Vec::with_capacity(m);
+
+        'outer: for &(cand_id, cand_dist_to_q) in &sorted {
+            let cand_vec = self.get_vec(cand_id);
+            for &r_id in &result {
+                let r_vec = self.get_vec(r_id);
+                let cand_to_r = distance(cand_vec.to_vec(), r_vec.to_vec(), self.similarity);
+                if cand_to_r < cand_dist_to_q {
+                    continue 'outer;
+                }
+            }
+            result.push(cand_id);
+            if result.len() >= m {
+                break;
+            }
+        }
+
+        result
+    }
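
A worked example of that rule: with the query at the origin and candidates at (1.0, 0), (1.1, 0), and (0, 1.2), the second is dropped because it is closer to the already-accepted first (0.1) than to the query (1.1), while the off-axis one survives. Standalone sketch of the same logic over plain pairs (the distance table is hypothetical, with approximate Euclidean values):

```rust
// Same selection rule as above, but over plain (id, dist-to-query)
// pairs; `dist` returns candidate-to-candidate distance.
fn select_diverse(mut cands: Vec<(u32, f32)>, dist: impl Fn(u32, u32) -> f32, m: usize) -> Vec<u32> {
    cands.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    let mut kept: Vec<u32> = Vec::new();
    'outer: for &(c, d_cq) in &cands {
        for &r in &kept {
            if dist(c, r) < d_cq {
                continue 'outer; // dominated by an accepted neighbor
            }
        }
        kept.push(c);
        if kept.len() == m {
            break;
        }
    }
    kept
}

fn main() {
    // Approximate Euclidean distances between the three candidates.
    let d = |a: u32, b: u32| match (a.min(b), a.max(b)) {
        (1, 2) => 0.1, // the clustered pair
        (1, 3) => 1.56,
        (2, 3) => 1.63,
        _ => 0.0,
    };
    let picked = select_diverse(vec![(1, 1.0), (2, 1.1), (3, 1.2)], d, 2);
    assert_eq!(picked, vec![1, 3]); // 2 was pruned as redundant
}
```
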
+
+    /// Connect new node `p` with `neighbors` on `level`
+    /// - ensure level storage exists
+    /// - merge and prune neighbor lists for `p` and each neighbor (cap by `m`/`M0`)
+    /// - skip linking into deleted nodes
+    pub(super) fn connect_bidirectional(
+        &mut self,
+        p: PointId,
+        neighbors: &[PointId],
+        level: usize,
+        m: usize,
+    ) {
+        self.merge_and_prune(p, level, neighbors, m);
+
+        for &n in neighbors {
+            if n == p {
+                continue;
+            }
+            self.merge_and_prune(n, level, &[p], m);
+        }
+    }
+}
+
+impl HnswIndex {
+    /// Pick a non-deleted entry point
+    pub(super) fn pick_entry(&self) -> Option<PointId> {
+        if let Some(ep) = self.index.entry_point {
+            if self.index.nodes.get(&ep).is_some_and(|n| !n.deleted) {
+                return Some(ep);
+            }
+        }
+        self.index
+            .nodes
+            .iter()
+            .filter(|(_, n)| !n.deleted)
+            .max_by(|a, b| a.1.level.cmp(&b.1.level).then_with(|| a.0.cmp(b.0)))
+            .map(|(id, _)| *id)
+    }
+
+    /// Ensure `node.neighbors[level]` exists
+    fn ensure_level(&mut self, id: PointId, level: usize) {
+        if let Some(node) = self.index.nodes.get(&id) {
+            if node.neighbors.len() > level {
+                return;
+            }
+        }
+
+        let node = self.index.nodes.get_mut(&id).expect("node must exist");
+        if node.neighbors.len() <= level {
+            node.neighbors.resize(level + 1, Vec::new());
+        }
+    }
+
+    /// Merge existing neighbors at `level` with `to_add`, then score by distance to `center`.
+    /// Sort ascending by distance and cap the final list to `cap` entries (no diversity heuristic).
+    /// Write the pruned list back to `center.neighbors[level]`.
+    fn merge_and_prune(&mut self, center: PointId, level: usize, to_add: &[PointId], cap: usize) {
+        self.ensure_level(center, level);
+
+        let mut merged: Vec<PointId> = {
+            let center_node = self.index.nodes.get_mut(&center).unwrap();
+            std::mem::take(&mut center_node.neighbors[level])
+        };
+
+        let mut seen: HashSet<PointId> = merged.iter().copied().collect();
+
+        for &n in to_add {
+            if n == center {
+                continue;
+            }
+            if !seen.insert(n) {
+                continue;
+            }
+            if let Some(nn) = self.index.nodes.get(&n) {
+                if nn.deleted {
+                    continue;
+                }
+            }
+            merged.push(n);
+        }
+
+        let center_vec = self.get_vec(center).to_vec();
+        let mut scored: Vec<(PointId, f32)> = merged
+            .into_iter()
+            .map(|nid| {
+                let d = distance(
+                    center_vec.clone(),
+                    self.get_vec(nid).to_vec(),
+                    self.similarity,
+                );
+                (nid, d)
+            })
+            .collect();
+        scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
+        scored.truncate(cap);
+        let new_list: Vec<PointId> = scored.into_iter().map(|(nid, _)| nid).collect();
+
+        {
+            let center_node = self.index.nodes.get_mut(&center).unwrap();
+            center_node.neighbors[level] = new_list;
+        }
+    }
+}
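
Design note: `merge_and_prune` caps back-links by raw distance only; the diversity heuristic is applied just once, when choosing the new node's own neighbors. The cap behaves like this reduced sketch (plain data stand-in for the real method):

```rust
// Keep only the `cap` closest ids, exactly as the truncate in
// merge_and_prune does after scoring against the center vector.
fn cap_by_distance(mut scored: Vec<(u32, f32)>, cap: usize) -> Vec<u32> {
    scored.sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap());
    scored.truncate(cap);
    scored.into_iter().map(|(id, _)| id).collect()
}

fn main() {
    let merged = vec![(7, 0.5), (8, 0.2), (9, 0.9)];
    assert_eq!(cap_by_distance(merged, 2), vec![8, 7]);
}
```
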
diff --git a/crates/index/src/hnsw/tests.rs b/crates/index/src/hnsw/tests.rs
new file mode 100644
index 0000000..07c9dd2
--- /dev/null
+++ b/crates/index/src/hnsw/tests.rs
@@ -0,0 +1,242 @@
+use super::*;
+use crate::flat::FlatIndex;
+use crate::VectorIndex;
+use defs::{IndexedVector, Similarity};
+use uuid::Uuid;
+
+const ID1: Uuid = Uuid::from_u128(1);
+const ID2: Uuid = Uuid::from_u128(2);
+const ID3: Uuid = Uuid::from_u128(3);
+const ID4: Uuid = Uuid::from_u128(4);
+const ID5: Uuid = Uuid::from_u128(5);
+
+#[test]
+fn test_entry_point_after_first_insert() {
+    let mut index = HnswIndex::new(Similarity::Euclidean, 2);
+    let v1 = IndexedVector {
+        id: ID1,
+        vector: vec![1.0, 0.0],
+    };
+    assert!(index.insert(v1).is_ok());
+
+    // Entry point should be set to the first inserted id
+    assert_eq!(index.index.entry_point, Some(ID1));
+    // Layer 0 should contain the point
+    assert!(index.index.points_by_layer.first().unwrap().contains(&ID1));
+}
+
+#[test]
+fn test_connectivity_level0_after_two_inserts() {
+    let mut index = HnswIndex::new(Similarity::Euclidean, 2);
+
+    let v1 = IndexedVector {
+        id: ID1,
+        vector: vec![0.0, 0.0],
+    };
+    let v2 = IndexedVector {
+        id: ID2,
+        vector: vec![1.0, 0.0],
+    };
+
+    index.insert(v1).unwrap();
+    index.insert(v2).unwrap();
+
+    assert!(index.index.nodes.contains_key(&ID1));
+    assert!(index.index.nodes.contains_key(&ID2));
+
+    // There should be a base layer (level 0)
+    let n1 = index.index.nodes.get(&ID1).unwrap();
+    let n2 = index.index.nodes.get(&ID2).unwrap();
+    assert!(!n1.neighbors.is_empty());
+    assert!(!n2.neighbors.is_empty());
+
+    // At level 0, each should have at least one neighbor; commonly connected to each other
+    let nbrs1_lvl0 = &n1.neighbors[0];
+    let nbrs2_lvl0 = &n2.neighbors[0];
+    assert!(!nbrs1_lvl0.is_empty());
+    assert!(!nbrs2_lvl0.is_empty());
+
+    // This check is lenient (probabilistic): it only asserts that at least one side linked to the other
+    let linked = nbrs1_lvl0.contains(&ID2) || nbrs2_lvl0.contains(&ID1);
+    assert!(linked);
+}
+
+#[test]
+fn test_search_matches_flat_small() {
+    let mut flat = FlatIndex::new();
+    let mut hnsw = HnswIndex::new(Similarity::Euclidean, 2);
+
+    let data = vec![
+        IndexedVector {
+            id: ID1,
+            vector: vec![1.0, 0.0],
+        },
+        IndexedVector {
+            id: ID2,
+            vector: vec![0.0, 1.0],
+        },
+        IndexedVector {
+            id: ID3,
+            vector: vec![1.0, 1.0],
+        },
+        IndexedVector {
+            id: ID4,
+            vector: vec![0.9, 0.1],
+        },
+        IndexedVector {
+            id: ID5,
+            vector: vec![0.2, 0.8],
+        },
+    ];
+
+    for v in data.clone() {
+        flat.insert(v.clone()).unwrap();
+        hnsw.insert(v).unwrap();
+    }
+
+    let queries = vec![vec![1.0, 0.2], vec![0.1, 0.9]];
+    let k = 2;
+
+    for q in queries {
+        let flat_ids = flat.search(q.clone(), Similarity::Euclidean, k).unwrap();
+        let hnsw_ids = hnsw.search(q.clone(), Similarity::Euclidean, k).unwrap();
+
+        // Both should return the same number of results, and HNSW should match Flat exactly on this tiny dataset
+        assert_eq!(hnsw_ids.len(), k.min(flat_ids.len()));
+        assert_eq!(hnsw_ids, flat_ids);
+    }
+}
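
HNSW is approximate, so asserting exact agreement with `FlatIndex` is only reasonable at this scale; on larger corpora the usual metric is recall@k against the exact results. A measurement sketch (hypothetical harness, not part of this PR):

```rust
use defs::Similarity;
use index::flat::FlatIndex;
use index::hnsw::HnswIndex;
use index::VectorIndex;

// Fraction of the exact top-k (from Flat) that HNSW also returns,
// averaged over a query set.
fn recall_at_k(flat: &FlatIndex, hnsw: &HnswIndex, queries: &[Vec<f32>], k: usize) -> f32 {
    let (mut hit, mut total) = (0usize, 0usize);
    for q in queries {
        let exact = flat.search(q.clone(), Similarity::Euclidean, k).unwrap();
        let approx = hnsw.search(q.clone(), Similarity::Euclidean, k).unwrap();
        hit += exact.iter().filter(|&id| approx.contains(id)).count();
        total += exact.len();
    }
    hit as f32 / total as f32
}
```
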
+
+#[test]
+fn test_search_empty_index_returns_empty() {
+    let index = HnswIndex::new(Similarity::Euclidean, 2);
+    let res = index
+        .search(vec![0.0, 0.0], Similarity::Euclidean, 3)
+        .unwrap();
+    assert!(res.is_empty());
+}
+
+#[test]
+fn test_search_k_zero_returns_empty() {
+    let mut index = HnswIndex::new(Similarity::Euclidean, 2);
+    index
+        .insert(IndexedVector {
+            id: ID1,
+            vector: vec![0.0, 0.0],
+        })
+        .unwrap();
+    let res = index
+        .search(vec![0.0, 0.0], Similarity::Euclidean, 0)
+        .unwrap();
+    assert!(res.is_empty());
+}
+
+#[test]
+fn test_search_cosine_normalization_basic() {
+    let mut index = HnswIndex::new(Similarity::Cosine, 2);
+    index
+        .insert(IndexedVector {
+            id: ID1,
+            vector: vec![1.0, 0.0],
+        })
+        .unwrap();
+    index
+        .insert(IndexedVector {
+            id: ID2,
+            vector: vec![0.0, 1.0],
+        })
+        .unwrap();
+    let res = index
+        .search(vec![10.0, 0.0], Similarity::Cosine, 1)
+        .unwrap();
+    assert_eq!(res, vec![ID1]);
+}
+
+#[test]
+fn test_soft_delete_and_search_skip() {
+    let mut index = HnswIndex::new(Similarity::Euclidean, 2);
+    index
+        .insert(IndexedVector {
+            id: ID1,
+            vector: vec![0.0, 0.0],
+        })
+        .unwrap();
+    index
+        .insert(IndexedVector {
+            id: ID2,
+            vector: vec![1.0, 0.0],
+        })
+        .unwrap();
+    index
+        .insert(IndexedVector {
+            id: ID3,
+            vector: vec![0.0, 1.0],
+        })
+        .unwrap();
+
+    let existed = index.delete(ID2).unwrap();
+    assert!(existed);
+    let n2 = index.index.nodes.get(&ID2).expect("node 2 must exist");
+    assert!(n2.deleted);
+
+    // Search near id=2 should not return 2
+    let res = index
+        .search(vec![0.9, 0.1], Similarity::Euclidean, 2)
+        .unwrap();
+    assert!(!res.contains(&ID2));
+
+    // Deleting a non-existent id returns false
+    let existed = index.delete(ID4).unwrap();
+    assert!(!existed);
+
+    // If entry point was 2, it should be updated to a non-deleted id
+    if let Some(ep) = index.index.entry_point {
+        if ep == ID2 {
+            panic!("entry point should have been moved off deleted id");
+        }
+    }
+}
+
+#[test]
+fn test_stats_and_deleted_ratio() {
+    let mut index = HnswIndex::new(Similarity::Euclidean, 2);
+    index
+        .insert(IndexedVector {
+            id: ID1,
+            vector: vec![0.0, 0.0],
+        })
+        .unwrap();
+    index
+        .insert(IndexedVector {
+            id: ID2,
+            vector: vec![1.0, 0.0],
+        })
+        .unwrap();
+    index
+        .insert(IndexedVector {
+            id: ID3,
+            vector: vec![0.0, 1.0],
+        })
+        .unwrap();
+    index
+        .insert(IndexedVector {
+            id: ID4,
+            vector: vec![1.0, 1.0],
+        })
+        .unwrap();
+
+    index.delete(ID2).unwrap();
+
+    let stats = index.stats();
+    assert_eq!(stats.alive + stats.deleted, index.index.nodes.len());
+    assert_eq!(stats.deleted, 1);
+    assert_eq!(stats.alive, index.index.nodes.len() - 1);
+
+    // Ratio should be > 0 and <= 0.5 for 1/4 deleted
+    let ratio = index.deleted_ratio();
+    assert!(ratio > 0.0 && ratio <= 0.5, "ratio was {ratio}");
+
+    // Histogram sums to alive count
+    let sum_hist: usize = stats.level_histogram.values().sum();
+    assert_eq!(sum_hist, stats.alive);
+}
diff --git a/crates/index/src/hnsw/types.rs b/crates/index/src/hnsw/types.rs
new file mode 100644
index 0000000..24d54ea
--- /dev/null
+++ b/crates/index/src/hnsw/types.rs
@@ -0,0 +1,73 @@
+use std::collections::HashMap;
+
+use defs::PointId;
+use rand::Rng;
+
+// Compact storage for layered points and adjacency used by `HnswIndex`.
+pub struct PointIndexation {
+    // Max connections per point per layer (M)
+    pub max_connections: usize,
+    // Max edges per node on layer 0 (often 2*M)
+    pub max_connections_0: usize,
+    // Maximum number of layers
+    pub max_layer: usize,
+    // Points per layer; each inner Vec holds the PointId(s)
+    pub points_by_layer: Vec<Vec<PointId>>,
+    // Per-node, per-level neighbor lists (bounded by M/M0)
+    pub nodes: HashMap<PointId, Node>,
+    // Number of points inserted
+    pub nb_point: usize,
+    // Optional entry point used for searches/insertions
+    pub entry_point: Option<PointId>,
+    // Level generator used to sample random levels
+    pub level_generator: LevelGenerator,
+}
+
+// Node with highest level and per-level neighbor lists
+pub struct Node {
+    pub id: PointId,
+    // Highest level (0-based; level 0 is the base layer)
+    pub level: u8,
+    // neighbors[level] -> neighbor PointIds at that level
+    pub neighbors: Vec<Vec<PointId>>,
+    // Soft-delete flag (skipped by traversals)
+    pub deleted: bool,
+}
+
+// Level sampling parameters
+pub struct LevelGenerator {
+    // 1 / ln(M)
+    pub level_scale: f64,
+}
+
+impl LevelGenerator {
+    pub fn from_m(m: usize) -> Self {
+        assert!(m >= 2, "LevelGenerator::from_m: m must be >= 2");
+        let level_scale = 1.0 / (m as f64).ln();
+        Self { level_scale }
+    }
+
+    /// Sample a level `L` from a geometric tail: P(L ≥ l) = M^(−l).
+    /// Uses inverse transform: L = floor(-ln(U) * (1/ln M)), capped to `max_layer - 1`.
+    pub fn sample_level<R: Rng>(&self, rng: &mut R, max_layer: usize) -> u8 {
+        let mut u: f64 = rng.random();
+        if u <= 0.0 {
+            u = f64::EPSILON;
+        }
+        if u >= 1.0 {
+            u = 1.0 - f64::EPSILON;
+        }
+
+        let raw = (-u.ln()) * self.level_scale;
+        let l = raw.floor() as usize;
+        let capped = l.min(max_layer.saturating_sub(1));
+        capped as u8
+    }
+}
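
With that tail, levels are geometric: P(L ≥ l) = M^(−l), so with M = 16 about 1/16 of points reach level ≥ 1 and the expected top level grows like log_M(N). A quick empirical check of the sampler's math (standalone; uses the rand 0.9 API this PR pins):

```rust
use rand::Rng;

fn main() {
    // Mirror sample_level's inverse transform for M = 16.
    let scale = 1.0 / 16.0_f64.ln();
    let mut rng = rand::rng();
    let n = 100_000;
    let mut at_least_one = 0usize;
    for _ in 0..n {
        let u: f64 = rng.random::<f64>().max(f64::EPSILON);
        if ((-u.ln()) * scale).floor() as usize >= 1 {
            at_least_one += 1;
        }
    }
    // Expected about n / 16 = 6250; allow generous sampling slack.
    let expected = n as f64 / 16.0;
    assert!((at_least_one as f64 - expected).abs() < expected * 0.2);
}
```
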
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct HnswStats {
+    pub alive: usize,
+    pub deleted: usize,
+    pub level_histogram: std::collections::BTreeMap<u8, usize>,
+}
diff --git a/crates/index/src/lib.rs b/crates/index/src/lib.rs
index cd363b0..5652688 100644
--- a/crates/index/src/lib.rs
+++ b/crates/index/src/lib.rs
@@ -1,6 +1,7 @@
 use defs::{DbError, DenseVector, IndexedVector, PointId, Similarity};
 
 pub mod flat;
+pub mod hnsw;
 
 pub trait VectorIndex: Send + Sync {
     fn insert(&mut self, vector: IndexedVector) -> Result<(), DbError>;
@@ -53,7 +54,7 @@ pub fn distance(a: DenseVector, b: DenseVector, dist_type: Similarity) -> f32 {
             let q = q_score.iter().sum::<f32>().sqrt();
             let r_score: Vec<f32> = b.iter().map(|&n| n * n).collect();
             let r = r_score.iter().sum::<f32>().sqrt();
-            1.0 - (p / (q * r))
+            1.0 - p / (q * r)
        }
    }
}
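
The last hunk is behavior-neutral: `/` binds tighter than binary `-`, so `1.0 - p / (q * r)` parses exactly like the old parenthesized form. A standalone check mirroring the cosine arm (local names only):

```rust
fn main() {
    let (a, b) = (vec![10.0_f32, 0.0], vec![1.0_f32, 0.0]);
    let p: f32 = a.iter().zip(&b).map(|(x, y)| x * y).sum();
    let q = a.iter().map(|n| n * n).sum::<f32>().sqrt();
    let r = b.iter().map(|n| n * n).sum::<f32>().sqrt();
    // Same parse tree, same result; colinear vectors score 0.0.
    assert_eq!(1.0 - p / (q * r), 1.0 - (p / (q * r)));
    assert_eq!(1.0 - p / (q * r), 0.0);
}
```
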