From 2a308f28104fe1b456c3d448771cd64822d9a35a Mon Sep 17 00:00:00 2001 From: Abi Date: Wed, 20 May 2026 11:07:11 -0700 Subject: [PATCH] refactor: record passage_id_scheme in meta.json (default "sequential") MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Sub-PR 1 of 5 from the plan in #329. Purely additive — no behavior change for any caller; existing index loaders ignore the field. Writes a new `passage_id_scheme: "sequential"` field into the .meta.json produced by both build_index and build_index_from_arrays. Bumps version to "1.1" for human-inspectable schema tracking (no code reads version today, so the bump is safe). Module-level constants PASSAGE_ID_SCHEME_SEQUENTIAL / _CONTENT_HASH document the value space; the content-hash scheme itself ships in sub-PR 2. --- packages/leann-core/src/leann/api.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/packages/leann-core/src/leann/api.py b/packages/leann-core/src/leann/api.py index 1dfda90d..23071930 100644 --- a/packages/leann-core/src/leann/api.py +++ b/packages/leann-core/src/leann/api.py @@ -31,6 +31,15 @@ logger = logging.getLogger(__name__) +# Passage ID schemes recorded in .meta.json["passage_id_scheme"]. +# - "sequential": today's default; IDs are str(insertion_index) (api.py:add_text). +# - "content-hash": planned in #329; IDs are sha256(text)[:16], stable across +# file moves and reorderings. +# Older indexes have no passage_id_scheme field — readers must default to +# "sequential" when the key is absent. See #329 for the rollout plan. 
+PASSAGE_ID_SCHEME_SEQUENTIAL = "sequential" +PASSAGE_ID_SCHEME_CONTENT_HASH = "content-hash" + def get_registered_backends() -> list[str]: """Get list of registered backend names.""" @@ -683,12 +692,13 @@ def build_index(self, index_path: str): builder_instance.build(embeddings, string_ids, index_path, **current_backend_kwargs) leann_meta_path = index_dir / f"{index_name}.meta.json" meta_data = { - "version": "1.0", + "version": "1.1", "backend_name": self.backend_name, "embedding_model": self.embedding_model, "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, + "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, "passage_sources": [ { "type": "jsonl", @@ -847,12 +857,13 @@ def build_index_from_arrays(self, index_path: str, ids: list, embeddings: np.nda # Create metadata file leann_meta_path = index_dir / f"{index_name}.meta.json" meta_data = { - "version": "1.0", + "version": "1.1", "backend_name": self.backend_name, "embedding_model": self.embedding_model, "dimensions": self.dimensions, "backend_kwargs": self.backend_kwargs, "embedding_mode": self.embedding_mode, + "passage_id_scheme": PASSAGE_ID_SCHEME_SEQUENTIAL, "passage_sources": [ { "type": "jsonl",