diff --git a/worker-sys/src/types.rs b/worker-sys/src/types.rs index 76235254..5417b1ca 100644 --- a/worker-sys/src/types.rs +++ b/worker-sys/src/types.rs @@ -14,6 +14,7 @@ mod r2; mod schedule; mod socket; mod tls_client_auth; +mod vectorize; mod version; mod websocket_pair; @@ -33,5 +34,6 @@ pub use r2::*; pub use schedule::*; pub use socket::*; pub use tls_client_auth::*; +pub use vectorize::*; pub use version::*; pub use websocket_pair::*; diff --git a/worker-sys/src/types/vectorize.rs b/worker-sys/src/types/vectorize.rs new file mode 100644 index 00000000..f1525806 --- /dev/null +++ b/worker-sys/src/types/vectorize.rs @@ -0,0 +1,30 @@ +use wasm_bindgen::prelude::*; + +#[wasm_bindgen] +extern "C" { + #[wasm_bindgen(extends=js_sys::Object)] + #[derive(Debug, Clone, PartialEq, Eq)] + pub type Vectorize; + + #[wasm_bindgen(method, catch)] + pub fn insert(this: &Vectorize, vectors: js_sys::Object) -> Result; + + #[wasm_bindgen(method, catch)] + pub fn upsert(this: &Vectorize, vectors: js_sys::Object) -> Result; + + #[wasm_bindgen(method, catch)] + pub fn describe(this: &Vectorize) -> Result; + + #[wasm_bindgen(method, catch)] + pub fn query( + this: &Vectorize, + vector: JsValue, + options: js_sys::Object, + ) -> Result; + + #[wasm_bindgen(method, catch, js_name = "getByIds")] + pub fn get_by_ids(this: &Vectorize, ids: JsValue) -> Result; + + #[wasm_bindgen(method, catch, js_name = "deleteByIds")] + pub fn delete_by_ids(this: &Vectorize, ids: JsValue) -> Result; +} diff --git a/worker/src/env.rs b/worker/src/env.rs index fe79d03a..f9c653a5 100644 --- a/worker/src/env.rs +++ b/worker/src/env.rs @@ -4,6 +4,7 @@ use std::fmt::Display; use crate::d1::D1Database; #[cfg(feature = "queue")] use crate::Queue; +use crate::Vectorize; use crate::{durable::ObjectNamespace, Bucket, DynamicDispatcher, Fetcher, Result}; use crate::{error::Error, hyperdrive::Hyperdrive}; @@ -89,6 +90,10 @@ impl Env { pub fn hyperdrive(&self, binding: &str) -> Result { self.get_binding(binding) } + + pub fn vectorize(&self, binding: &str) -> Result { + self.get_binding(binding) + } } pub trait EnvBinding: Sized + JsCast { diff --git a/worker/src/lib.rs b/worker/src/lib.rs index bbaac220..b2741289 100644 --- a/worker/src/lib.rs +++ b/worker/src/lib.rs @@ -190,6 +190,7 @@ pub use crate::router::{RouteContext, RouteParams, Router}; pub use crate::schedule::*; pub use crate::socket::*; pub use crate::streams::*; +pub use crate::vectorize::*; pub use crate::version::*; pub use crate::websocket::*; @@ -226,6 +227,7 @@ mod schedule; pub mod send; mod socket; mod streams; +mod vectorize; mod version; mod websocket; diff --git a/worker/src/vectorize.rs b/worker/src/vectorize.rs new file mode 100644 index 00000000..398f74df --- /dev/null +++ b/worker/src/vectorize.rs @@ -0,0 +1,326 @@ +use std::collections::HashMap; + +use crate::{send::SendFuture, EnvBinding, Result}; +use serde::{Deserialize, Serialize}; +use wasm_bindgen::{JsCast, JsValue}; +use wasm_bindgen_futures::JsFuture; +use worker_sys::types::Vectorize as VectorizeSys; + +/// Supported distance metrics for an index. +/// Distance metrics determine how other "similar" vectors are determined. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "kebab-case")] +pub enum VectorizeDistanceMetric { + Euclidean, + Cosine, + DotProduct, +} + +/// Information about the configuration of an index. +#[derive(Debug, Deserialize)] +#[serde(untagged)] +pub enum VectorizeIndexConfig { + Preset { + preset: String, + }, + Custom { + dimensions: u16, + metric: VectorizeDistanceMetric, + }, +} + +/// Metadata about an existing index. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct VectorizeIndexInfo { + /// The number of records containing vectors within the index. + pub vector_count: u64, + /// Number of dimensions the index has been configured for. + pub dimensions: u32, + /// ISO 8601 datetime of the last processed mutation on in the index. All changes before this mutation will be reflected in the index state. + #[serde(skip_serializing_if = "Option::is_none")] + pub processed_up_to_datetime: Option, + /// UUIDv4 of the last mutation processed by the index. All changes before this mutation will be reflected in the index state. + #[serde(skip_serializing_if = "Option::is_none")] + pub processed_up_to_mutation: Option, +} + +/// Results of an operation that performed a mutation on a set of vectors. +#[derive(Debug, Deserialize)] +#[serde(rename_all = "camelCase")] +pub struct VectorizeVectorAsyncMutation { + /// The unique identifier for the async mutation operation containing the changeset. + pub mutation_id: String, +} + +/// Represents a single vector value set along with its associated metadata. +#[derive(Debug, Serialize)] +pub struct VectorizeVector<'a> { + /// The ID for the vector. This can be user-defined, and must be unique. It should uniquely identify the object, and is best set based on the ID of what the vector represents. + id: String, + /// The vector values. + values: &'a [f32], + /// The namespace this vector belongs to. + namespace: Option, + /// Metadata associated with the vector. Includes the values of other fields and potentially additional details. + metadata: serde_json::Map, +} + +impl<'a> VectorizeVector<'a> { + pub fn new(id: &str, values: &'a [f32]) -> Self { + Self { + id: id.to_owned(), + values, + namespace: None, + metadata: serde_json::Map::new(), + } + } + + pub fn with_namespace(mut self, namespace: String) -> Self { + self.namespace = Some(namespace); + self + } + + pub fn with_metadata_entry(mut self, key: &str, value: V) -> Result { + self.metadata + .insert(key.to_owned(), serde_json::to_value(value)?); + Ok(self) + } +} + +/// Metadata return levels for a Vectorize query. +#[derive(Debug, Default, Serialize)] +#[serde(rename_all = "kebab-case")] +pub enum VectorizeMetadataRetrievalLevel { + /// Full metadata for the vector return set, including all fields (including those un-indexed) without truncation. This is a more expensive retrieval, as it requires additional fetching & reading of un-indexed data. + All, + /// Return all metadata fields configured for indexing in the vector return set. This level of retrieval is "free" in that no additional overhead is incurred returning this data. However, note that indexed metadata is subject to truncation (especially for larger strings). + Indexed, + /// No indexed metadata will be returned. + #[default] + None, +} + +/// Comparison logic/operation to use for metadata filtering. +/// +/// This list is expected to grow as support for more operations are released. +#[derive(Debug, Serialize, Hash, PartialEq, Eq)] +pub enum VectorizeVectorMetadataFilterOp { + #[serde(rename = "$eq")] + Eq, + #[serde(rename = "$ne")] + Neq, +} + +/// Filter criteria for vector metadata used to limit the retrieved query result set. +type VectorizeVectorMetadataFilter = + HashMap>; + +#[derive(Debug, Serialize)] +#[serde(rename_all = "camelCase")] +pub struct VectorizeQueryOptions { + // Default 5, max 100 + top_k: u8, + /// Return vectors from the specified namespace. Default `none`. + namespace: Option, + /// Return vector values. Default `false`. + return_values: bool, + /// Return vector metadata. Default `None`. + return_metadata: VectorizeMetadataRetrievalLevel, + /// Default `none`. + filter: Option, +} + +impl VectorizeQueryOptions { + pub fn new() -> Self { + Self::default() + } + + pub fn with_top_k(mut self, top_k: u8) -> Self { + self.top_k = top_k; + self + } + + pub fn with_namespace(mut self, namespace: &str) -> Self { + self.namespace = Some(namespace.to_owned()); + self + } + + pub fn with_return_values(mut self, return_values: bool) -> Self { + self.return_values = return_values; + self + } + + pub fn with_return_metadata( + mut self, + return_metadata: VectorizeMetadataRetrievalLevel, + ) -> Self { + self.return_metadata = return_metadata; + self + } + + pub fn with_filter_entry( + mut self, + key: &str, + op: VectorizeVectorMetadataFilterOp, + value: T, + ) -> Result { + let mut filter = self.filter.unwrap_or_default(); + let inner = filter.entry(key.to_owned()).or_default(); + inner.insert(op, serde_json::to_value(value)?); + self.filter = Some(filter); + Ok(self) + } +} + +impl Default for VectorizeQueryOptions { + fn default() -> Self { + Self { + top_k: 5, + namespace: None, + return_values: false, + return_metadata: VectorizeMetadataRetrievalLevel::None, + filter: None, + } + } +} + +/// Represents a single vector value set along with its associated metadata. +#[derive(Debug, Deserialize)] +pub struct VectorizeVectorResult { + /// The ID for the vector. This can be user-defined, and must be unique. It should uniquely identify the object, and is best set based on the ID of what the vector represents. + pub id: String, + /// The vector values. + pub values: Option>, + /// Metadata associated with the vector. Includes the values of other fields and potentially additional details. + pub metadata: Option>, + /** The namespace the vector belongs to. */ + pub namespace: Option, +} + +#[derive(Debug, Deserialize)] +pub struct VectorizeMatchVector { + #[serde(flatten)] + pub vector: VectorizeVectorResult, + /// The score or rank for similarity, when returned as a result + pub score: Option, +} + +/// A set of matching [VectorizeMatchVector] for a particular query. +#[derive(Debug, Deserialize)] +pub struct VectorizeMatches { + pub matches: Vec, + pub count: u64, +} + +/// A Vectorize Vector Search Index for querying vectors/embeddings. +pub struct Vectorize(VectorizeSys); + +unsafe impl Send for Vectorize {} +unsafe impl Sync for Vectorize {} + +impl EnvBinding for Vectorize { + const TYPE_NAME: &'static str = "VectorizeImpl"; +} + +impl Vectorize { + /// Get information about the currently bound index. + pub async fn describe(&self) -> Result { + let promise = self.0.describe()?; + let fut = SendFuture::new(JsFuture::from(promise)); + let details = fut.await?; + Ok(serde_wasm_bindgen::from_value(details)?) + } + + /// Insert a list of vectors into the index dataset. If a provided id exists, an error will be thrown. + pub async fn insert<'a>( + &self, + vectors: &[VectorizeVector<'a>], + ) -> Result { + let promise = self + .0 + .insert(serde_wasm_bindgen::to_value(&vectors)?.into())?; + let fut = SendFuture::new(JsFuture::from(promise)); + let mutation = fut.await?; + Ok(serde_wasm_bindgen::from_value(mutation)?) + } + + /// Upsert a list of vectors into the index dataset. If a provided id exists, it will be replaced with the new values. + pub async fn upsert<'a>( + &self, + vectors: &[VectorizeVector<'a>], + ) -> Result { + let promise = self + .0 + .upsert(serde_wasm_bindgen::to_value(&vectors)?.into())?; + let fut = SendFuture::new(JsFuture::from(promise)); + let mutation = fut.await?; + Ok(serde_wasm_bindgen::from_value(mutation)?) + } + + /// Use the provided vector to perform a similarity search across the index. + pub async fn query( + &self, + vector: JsValue, + options: VectorizeQueryOptions, + ) -> Result { + let opts = serde_wasm_bindgen::to_value(&options)?; + let promise = self.0.query(vector, opts.into())?; + let fut = SendFuture::new(JsFuture::from(promise)); + let matches = fut.await?; + Ok(serde_wasm_bindgen::from_value(matches)?) + } + + /// Delete a list of vectors with a matching id. + pub async fn delete_by_ids<'a, T>(&self, ids: T) -> Result + where + T: IntoIterator, + { + // TODO: Can we avoid this allocation? + let ids: Vec = ids.into_iter().map(|id| id.to_string()).collect(); + let arg = serde_wasm_bindgen::to_value(&ids)?; + let promise = self.0.delete_by_ids(arg)?; + let fut = SendFuture::new(JsFuture::from(promise)); + let mutation = fut.await?; + Ok(serde_wasm_bindgen::from_value(mutation)?) + } + + /// Get a list of vectors with a matching id. + pub async fn get_by_ids<'a, T>(&self, ids: T) -> Result> + where + T: IntoIterator, + { + let ids: Vec = ids.into_iter().map(|id| id.to_string()).collect(); + let arg = serde_wasm_bindgen::to_value(&ids)?; + let promise = self.0.get_by_ids(arg)?; + let fut = SendFuture::new(JsFuture::from(promise)); + let vectors = fut.await?; + Ok(serde_wasm_bindgen::from_value(vectors)?) + } +} + +impl JsCast for Vectorize { + fn instanceof(val: &JsValue) -> bool { + val.is_instance_of::() + } + + fn unchecked_from_js(val: JsValue) -> Self { + Self(val.into()) + } + + fn unchecked_from_js_ref(val: &JsValue) -> &Self { + unsafe { &*(val as *const JsValue as *const Self) } + } +} + +impl From for JsValue { + fn from(index: Vectorize) -> Self { + JsValue::from(index.0) + } +} + +impl AsRef for Vectorize { + fn as_ref(&self) -> &JsValue { + &self.0 + } +}