From 163367d6eb896303ebfc82786a1921814e275206 Mon Sep 17 00:00:00 2001 From: Abi Date: Wed, 20 May 2026 10:11:56 -0700 Subject: [PATCH] perf: lazy-import torch and tiktoken in embedding_compute / chat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit embedding_compute.py:14-16 and chat.py:13 import torch / tiktoken at module top, which means `import leann` pulls ~1 GB of torch state even for callers that only do MCP search over a prebuilt index, BM25-only queries, or other paths that never touch the embedding pipeline. Moved torch into the two functions that actually use it (compute_embeddings_sentence_transformers, HFLLM.ask). The lazy imports in HFLLM.__init__ and compute_embeddings_ollama were already function-local, so they're unchanged. Moved tiktoken into truncate_to_token_limit. `import leann` drops from ~6700ms to ~128ms locally; torch and tiktoken stay out of sys.modules until first real use. I'm assuming the eager imports were just convenience and not load-bearing in any way I'm missing (e.g. catching ImportError up-front for a clearer error message). Happy to revisit if there's a reason they need to be loaded early. I didn't find an existing issue for this — happy to open one if you'd prefer that path. --- packages/leann-core/src/leann/chat.py | 4 ++-- packages/leann-core/src/leann/embedding_compute.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/packages/leann-core/src/leann/chat.py b/packages/leann-core/src/leann/chat.py index 52426d13..a36383f5 100644 --- a/packages/leann-core/src/leann/chat.py +++ b/packages/leann-core/src/leann/chat.py @@ -10,8 +10,6 @@ from abc import ABC, abstractmethod from typing import Any, Optional, cast -import torch - from .settings import ( resolve_anthropic_api_key, resolve_anthropic_base_url, @@ -720,6 +718,8 @@ def ask(self, prompt: str, **kwargs) -> str: logger.info(f"Generating with HuggingFace model, config: {generation_config}") # Generate + import torch + with torch.no_grad(): outputs = self.model.generate(**inputs, **generation_config) diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py index ac8488b0..62b3cc5d 100644 --- a/packages/leann-core/src/leann/embedding_compute.py +++ b/packages/leann-core/src/leann/embedding_compute.py @@ -12,11 +12,13 @@ from typing import Any, Optional, Protocol, cast import numpy as np -import tiktoken -import torch from .settings import resolve_ollama_host, resolve_openai_api_key, resolve_openai_base_url +# torch and tiktoken are imported lazily inside the functions that use them, so +# `import leann` (e.g. for MCP search over an existing index, BM25-only flows, +# or non-embedding utilities) doesn't pull torch's ~1 GB of state into memory. + # Set up logger with proper level logger = logging.getLogger(__name__) LOG_LEVEL = os.getenv("LEANN_LOG_LEVEL", "WARNING").upper() @@ -145,6 +147,8 @@ def truncate_to_token_limit(texts: list[str], token_limit: int) -> list[str]: if not texts: return [] + import tiktoken + # Use tiktoken with cl100k_base encoding enc = tiktoken.get_encoding("cl100k_base") @@ -419,6 +423,8 @@ def compute_embeddings_sentence_transformers( is_build: Whether this is a build operation (shows progress bar) adaptive_optimization: Whether to use adaptive optimization based on batch size """ + import torch + outer_start_time = time.time() # Handle empty input if not texts: