
Commit c191b38

fix: WAR for under-allocation in torch VSWA kvcachemanager
Signed-off-by: qixiang-99 <[email protected]>
1 parent c198402

File tree

cpp/include/tensorrt_llm/executor/executor.h
cpp/tensorrt_llm/executor/kvCacheConfig.cpp
tensorrt_llm/_torch/pyexecutor/resource_manager.py

3 files changed: 10 additions & 4 deletions

cpp/include/tensorrt_llm/executor/executor.h

Lines changed: 1 addition & 1 deletion

@@ -1020,7 +1020,7 @@ class KvCacheConfig
     void setEnableBlockReuse(bool enableBlockReuse);
     void setEnablePartialReuse(bool enablePartialReuse);
     void setCopyOnPartialReuse(bool copyOnPartialReuse);
-    void setMaxTokens(SizeType32 maxTokens);
+    void setMaxTokens(std::optional<SizeType32> maxTokens);
     void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec);
     void setSinkTokenLength(SizeType32 sinkTokenLength);
     void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
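This signature change makes the max-tokens cap clearable instead of always-required. On the Python side it surfaces through the KvCacheConfigCpp binding, which the resource_manager.py change below relies on. A minimal sketch, assuming the binding exposes the setter as a max_tokens property (the import path is illustrative):

from tensorrt_llm.bindings.executor import KvCacheConfig as KvCacheConfigCpp

config = KvCacheConfigCpp()
config.max_tokens = 4096  # explicit cap on tokens held in the KV cache
config.max_tokens = None  # now valid: std::nullopt clears the cap entirely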

cpp/tensorrt_llm/executor/kvCacheConfig.cpp

Lines changed: 5 additions & 2 deletions

@@ -143,9 +143,12 @@ void KvCacheConfig::setCopyOnPartialReuse(bool copyOnPartialReuse)
     mCopyOnPartialReuse = copyOnPartialReuse;
 }
 
-void KvCacheConfig::setMaxTokens(SizeType32 maxTokens)
+void KvCacheConfig::setMaxTokens(std::optional<SizeType32> maxTokens)
 {
-    TLLM_CHECK(maxTokens > 0);
+    if (maxTokens)
+    {
+        TLLM_CHECK(maxTokens.value() > 0);
+    }
     mMaxTokens = maxTokens;
 }
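The guard now fires only when a value is actually supplied, so std::nullopt passes through and clears mMaxTokens. A rough Python analogue of the new validation flow (set_max_tokens and the config argument here are hypothetical, for illustration only):

from typing import Optional

def set_max_tokens(config, max_tokens: Optional[int]) -> None:
    # Validate only when a value is present, mirroring the C++ setter:
    # None (std::nullopt) skips the check and simply clears the cap.
    if max_tokens is not None:
        assert max_tokens > 0, "maxTokens must be positive"  # TLLM_CHECK analogue
    config.max_tokens = max_tokens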

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 4 additions & 1 deletion

@@ -206,6 +206,9 @@ def __init__(
         assert isinstance(
             kv_cache_config, KvCacheConfigCpp
         ), "calculate_max_num_blocks_from_cpp only accepts KvCacheConfigCpp"
+
+        # overwrite max_tokens in VSWA case
+        kv_cache_config.max_tokens = None
         blocks_per_window = self.calculate_max_num_blocks_from_cpp(
             kv_cache_config=kv_cache_config,
             model_config=model_config,
@@ -633,7 +636,7 @@ def calculate_max_num_blocks_from_cpp(
         logger.debug(f"window_size_to_layers: {window_size_to_layers}")
 
         free_mem, total_mem = torch.cuda.mem_get_info()
-        primary_pool_memory_bytes = free_mem
+        primary_pool_memory_bytes = int(free_mem * 0.9)
         secondary_pool_memory_bytes = 0
         logger.debug(
             f"primary_pool_memory_bytes is set to {primary_pool_memory_bytes/1024**3}GB, \nsecondary_pool_memory_bytes is set to {secondary_pool_memory_bytes/1024**3}GB"

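Previously the primary pool claimed all of free memory; the 0.9 factor holds back 10%, which appears intended to leave headroom for allocations made after the estimate, while dropping max_tokens lets the VSWA pool sizing proceed uncapped. A self-contained sketch of the sizing arithmetic; bytes_per_block is a hypothetical stand-in, since the real per-block cost depends on dtype, head count, head size, tokens per block, and the layers in each attention-window group:

import torch

def estimate_primary_pool_bytes(headroom: float = 0.9) -> int:
    # Size the primary KV-cache pool from currently free device memory,
    # holding back 10% so later allocations do not overrun the budget.
    free_mem, _total_mem = torch.cuda.mem_get_info()
    return int(free_mem * headroom)

# Hypothetical per-block cost: K and V, fp16 (2 bytes), 8 KV heads,
# head size 128, 32 tokens per block, for a single layer.
bytes_per_block = 2 * 2 * 8 * 128 * 32
max_num_blocks = estimate_primary_pool_bytes() // bytes_per_block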