diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index e22633ad..6ac629a3 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -30,6 +30,15 @@ logger = logging.getLogger(__name__) +# Passage ID schemes recorded in .meta.json["passage_id_scheme"]. +# - "sequential": today's default; IDs are str(insertion_index) (api.py:add_text). +# - "content-hash": planned in #329; IDs are sha256(text)[:16], stable across +# file moves and reorderings. +# Older indexes have no passage_id_scheme field — readers must default to +# "sequential" when the key is absent. See #329 for the rollout plan. +PASSAGE_ID_SCHEME_SEQUENTIAL = "sequential" +PASSAGE_ID_SCHEME_CONTENT_HASH = "content-hash" + def get_registered_backends() -> list[str]: """Get list of registered backend names.""" @@ -570,12 +579,13 @@ def build_index(self, index_path: str): builder_instance.build(embeddings, string_ids, index_path, **current_backend_kwargs) leann_meta_path = index_dir / f"{index_name}.meta.json" meta_data = { - "version": "1.0", + "version": "1.1", "backend_name": self.backend_name, "embedding_model": self.embedding_model, "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, + "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, "passage_sources": [ { "type": "jsonl", @@ -714,12 +724,13 @@ def build_index_from_arrays(self, index_path: str, ids: list, embeddings: np.nda # Create metadata file leann_meta_path = index_dir / f"{index_name}.meta.json" meta_data = { - "version": "1.0", + "version": "1.1", "backend_name": self.backend_name, "embedding_model": self.embedding_model, "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, + "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, "passage_sources": [ { "type": "jsonl", diff --git a/tests/test_passage_id_scheme.py b/tests/test_passage_id_scheme.py new file mode 
100644
index 00000000..33bc9710
--- /dev/null
+++ b/tests/test_passage_id_scheme.py
@@ -0,0 +1,61 @@
+"""Tests that index builds record the passage ID scheme in .meta.json."""
+
+import json
+
+import numpy as np
+from leann.api import PASSAGE_ID_SCHEME_SEQUENTIAL, LeannBuilder
+
+
+class _FakeBackendFactory:
+    """Backend factory stand-in whose builders do nothing."""
+
+    class _Builder:
+        """Builder stand-in: accepts build() arguments and ignores them."""
+
+        def build(self, embeddings, ids, index_path, **kwargs):
+            return None
+
+    def builder(self, **kwargs):
+        return self._Builder()
+
+
+def _make_builder():
+    """Return an hnsw LeannBuilder using the fake backend, with "alpha" added."""
+    builder = LeannBuilder(backend_name="hnsw", dimensions=2)
+    builder.backend_factory = _FakeBackendFactory()
+    builder.add_text("alpha")
+    return builder
+
+
+def _load_meta(index_dir):
+    """Parse documents.leann.meta.json from index_dir."""
+    meta_file = index_dir / "documents.leann.meta.json"
+    return json.loads(meta_file.read_text(encoding="utf-8"))
+
+
+def test_build_index_records_default_passage_id_scheme(tmp_path, monkeypatch):
+    """build_index records the sequential scheme in the metadata file."""
+
+    def _constant_embeddings(chunks, *args, **kwargs):
+        return np.ones((len(chunks), 2), dtype=np.float32)
+
+    monkeypatch.setattr("leann.api.compute_embeddings", _constant_embeddings)
+    builder = _make_builder()
+
+    builder.build_index(str(tmp_path / "documents.leann"))
+
+    assert _load_meta(tmp_path)["passage_id_scheme"] == PASSAGE_ID_SCHEME_SEQUENTIAL
+
+
+def test_build_index_from_arrays_records_default_passage_id_scheme(tmp_path):
+    """build_index_from_arrays records the sequential scheme in the metadata file."""
+    builder = _make_builder()
+    vectors = np.ones((1, 2), dtype=np.float32)
+
+    builder.build_index_from_arrays(
+        str(tmp_path / "documents.leann"),
+        ids=["0"],
+        embeddings=vectors,
+    )
+
+    assert _load_meta(tmp_path)["passage_id_scheme"] == PASSAGE_ID_SCHEME_SEQUENTIAL