From 3dcc23b27f0dcdd09d068a6f21b946fc82eed432 Mon Sep 17 00:00:00 2001 From: yunruis <205571022+yunruis@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:46:01 +0800 Subject: [PATCH 01/26] [https://nvbugs/5606268][fix] Fix program exit segmentation fault triggered by the CublasMMWrapper destructor (#8834) Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- cpp/tensorrt_llm/common/opUtils.cpp | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 053f9d9ece7..ae3810a255f 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter) : mCreator{std::move(creator)} , mDeleter{std::move(deleter)} + , mObservers{new std::unordered_map, hash>()} { } + ~PerCudaCtxPerThreadSingletonCreator() + { + std::lock_guard lk{mMutex}; + delete mObservers; + mObservers = nullptr; + } + std::shared_ptr operator()() { std::lock_guard lk{mMutex}; CUcontext ctx{getCurrentCudaCtx()}; std::thread::id thread = std::this_thread::get_id(); auto const key = std::make_tuple(ctx, thread); - std::shared_ptr result = mObservers[key].lock(); + std::shared_ptr result = (*mObservers)[key].lock(); if (result == nullptr) { TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread); @@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator } mDeleter(obj); + if (mObservers == nullptr) + { + return; + } + // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts // frequently. std::shared_ptr observedObjHolder; // Delay destroy to avoid dead lock. @@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator // thread just before we lock mMutex. We can't infer that the observer is stale from the fact that // obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one // atomic operation, and the observer may be changed to observe another instance. - if (mObservers.find(key) == mObservers.end()) + auto it = mObservers->find(key); + if (it == mObservers->end()) { return; } - observedObjHolder = mObservers.at(key).lock(); + observedObjHolder = it->second.lock(); if (observedObjHolder == nullptr) { - mObservers.erase(key); + mObservers->erase(it); } }}; - mObservers.at(key) = result; + (*mObservers)[key] = result; } else { @@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator mutable std::mutex mMutex; // CUDA resources are per-context and per-thread.
using CacheKey = std::tuple; - std::unordered_map, hash> mObservers; + std::unordered_map, hash>* mObservers; }; } // namespace @@ -253,6 +267,7 @@ std::shared_ptr getCublasHandle() { TLLM_CUDA_CHECK(cublasDestroy(*handle)); delete handle; + handle = nullptr; }); return creator(); } @@ -270,6 +285,7 @@ std::shared_ptr getCublasLtHandle() { TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); delete handle; + handle = nullptr; }); return creator(); } From 13be0b7aa3acfa9c15cbb8491279b04e2eeb9cc9 Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:09:58 +0800 Subject: [PATCH 02/26] [https://nvbugs/5608930][fix] Unwaive test 5608930 (#8831) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 248c7e58bf1..da7ccc01cea 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -358,7 +358,6 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKI accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584) -accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill SKIP (https://nvbugs/5608930) examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) From f8a9ecf1e04373bb639119e6be0ea9dcfe120bae Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:53:24 +0800 Subject: [PATCH 03/26] [https://nvbugs/5461796][fix] Unwaive test test_llmapi_speculative_decoding_mtp (#8832) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index da7ccc01cea..1856f8dbd5d 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -227,7 +227,6 @@ triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (http triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) -llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796) accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525) 
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] SKIP (https://nvbugs/5465143) examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKIP (https://nvbugs/5465143) From 0db9f3c44943c50d74137034f1c13011d987c2d1 Mon Sep 17 00:00:00 2001 From: brb-nv <169953907+brb-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:49:36 -0800 Subject: [PATCH 04/26] [https://nvbugs/5521253][fix] Enable Gemma3 12B & 27B on SM100 (#8666) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- cpp/kernels/fmha_v2/setup.py | 10 ++++++++++ cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp | 2 +- tests/integration/defs/test_e2e.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 895b4a4592b..43a175ba803 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -6398,6 +6398,16 @@ def enumerate_kernels(): and kspec.cross_mha == False and kspec.flash_attention == True and kspec.input_layout != InputLayout.SEPARATE_Q_K_V) + # Gemma3 VL support. + or (kspec.sm == 100 + and kspec.dtype in ['fp16', 'bf16', 'fp16_fp32', 'e4m3', 'e4m3_fp32'] + and kspec.head_size == 72 + and kspec.head_size_v == 0 + and kspec.sage_block_sizes is None + and kspec.version == 2 + and kspec.cross_mha == False + and kspec.flash_attention == True + and kspec.input_layout != InputLayout.SEPARATE_Q_K_V) # Deepseek MLA (generation 576/512 paged) or (kspec.sm in [90, 100, 120] and kspec.dtype in ['bf16', 'e4m3_fp32'] diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp index 70327e9ca7a..11b3e1b0ffd 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp @@ -49,7 +49,7 @@ FmhaDispatcher::FmhaDispatcher(MHARunnerFixedParams fixedParams) // TRTLLM-GEN only supports power of 2 head sizes. // The exception will fall back to fmha v2. // Please update fmha_v2/setup.py if you want to add more supported head sizes. 
- , mUseTllmGen(tensorrt_llm::common::isSM100Family() && fixedParams.headSize != 80) + , mUseTllmGen(tensorrt_llm::common::isSM100Family() && fixedParams.headSize != 80 && fixedParams.headSize != 72) { if (mUseTllmGen) { diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 06ba812f921..16bea901095 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2497,6 +2497,10 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv): "gemma/gemma-3-27b-it", marks=(pytest.mark.skip_less_device_memory(80000), skip_post_blackwell)), + pytest.param( + "Nano-v2-VLM", + "Nano-v2-VLM", + marks=pytest.mark.skip(reason="Nano V2 VLM ckpt is not released yet.")), ]) def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, modality, use_cuda_graph): From 06f338b884331883f90fe4fbaebc82be9770c570 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Tue, 4 Nov 2025 18:12:02 +0800 Subject: [PATCH 05/26] [https://nvbugs/5606266][test] move qwen3 multi-node test to the qa list (#8908) Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/qa/llm_function_multinode.txt | 1 + tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_lists/qa/llm_function_multinode.txt b/tests/integration/test_lists/qa/llm_function_multinode.txt index f2e3f8d216a..8a3958cf33c 100644 --- a/tests/integration/test_lists/qa/llm_function_multinode.txt +++ b/tests/integration/test_lists/qa/llm_function_multinode.txt @@ -11,3 +11,4 @@ test_e2e.py::test_multi_nodes_eval[Kimi-K2-Instruct-tp16-mmlu] test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp2pp1-gen_tp2pp1] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index eb618093379..57c3b6fd810 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -39,4 +39,3 @@ l0_gb200_multi_nodes: - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90) - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180) From 883d95cab6ba0bc901330692e3a6444c6372bed5 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Wed, 5 Nov 2025 01:34:26 +0800 Subject: [PATCH 06/26] [https://nvbugs/5569754][chore] Adjust max batch size to prevent OOM (#8876) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- examples/llm-api/llm_mgmn_llm_distributed.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/examples/llm-api/llm_mgmn_llm_distributed.sh b/examples/llm-api/llm_mgmn_llm_distributed.sh index bc6b6e16a62..fa23ae2548c 100644 --- a/examples/llm-api/llm_mgmn_llm_distributed.sh +++ b/examples/llm-api/llm_mgmn_llm_distributed.sh @@ -51,5 +51,6 @@ srun -l \ trtllm-llmapi-launch python3 $script \ --model_dir $LOCAL_MODEL \ --prompt 'Hello, how are you?' \ - --tp_size 2 + --tp_size 2 \ + --max_batch_size 256 " From cb36f3e14fdfd3ae670613377cbddd1a53dd8e6f Mon Sep 17 00:00:00 2001 From: Simeng Liu <109828133+SimengLiu-nv@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:11:48 -0800 Subject: [PATCH 07/26] [https://nvbugs/5606136][fix] Fix torch.onnx.export with pytorch upgrade to fallback to dynamo=False. (#8917) Signed-off-by: Simeng Liu Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../models/contrib/dit/vae_decoder_trt.py | 21 +++++++++++-------- examples/models/core/qwenvl/vit_onnx_trt.py | 3 ++- tensorrt_llm/tools/multimodal_builder.py | 17 ++++++++------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/models/contrib/dit/vae_decoder_trt.py b/examples/models/contrib/dit/vae_decoder_trt.py index 1374dbcbfa6..31803a6690d 100644 --- a/examples/models/contrib/dit/vae_decoder_trt.py +++ b/examples/models/contrib/dit/vae_decoder_trt.py @@ -34,15 +34,18 @@ def export_onnx(self, onnxFile): *self.latent_shape).cuda() self.pytorch_model.cuda().eval() with torch.inference_mode(): - torch.onnx.export(self.pytorch_model, - latent, - onnxFile, - opset_version=17, - input_names=['input'], - output_names=['output'], - dynamic_axes={'input': { - 0: 'batch' - }}) + torch.onnx.export( + self.pytorch_model, + latent, + onnxFile, + opset_version=17, + input_names=['input'], + output_names=['output'], + dynamic_axes={'input': { + 0: 'batch' + }}, + # Required for pytorch>=2.9.0 as dynamo becomes the default and introduces bugs as it does not support opset_version=17 natively + dynamo=False) def generate_trt_engine(self, onnxFile, planFile): print(f"Start exporting TRT model to {planFile}!") diff --git a/examples/models/core/qwenvl/vit_onnx_trt.py b/examples/models/core/qwenvl/vit_onnx_trt.py index b667224a6b8..ba21fc93ef0 100644 --- a/examples/models/core/qwenvl/vit_onnx_trt.py +++ b/examples/models/core/qwenvl/vit_onnx_trt.py @@ -89,7 +89,8 @@ def export_onnx(self, onnx_file_path, pretrained_model_path, image_url): dynamic_axes={"input": { 0: "batch" }}, - ) + # Required for pytorch>=2.9.0 as dynamo becomes the default and introduces bugs as it does not support opset_version=17 natively + dynamo=False) release_gc() # Further release memory print( f"Export to ONNX file successfully! 
The ONNX file stays in {onnx_file_path}" diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index ea643475fd1..3906fbe2747 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -163,13 +163,16 @@ def export_onnx(model, logger.log(trt.Logger.INFO, f"Exporting onnx to {onnx_dir}/{onnx_name}") os.makedirs(onnx_dir, exist_ok=True) - torch.onnx.export(model, - input, - f'{onnx_dir}/{onnx_name}', - opset_version=17, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes) + torch.onnx.export( + model, + input, + f'{onnx_dir}/{onnx_name}', + opset_version=17, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + # Required for pytorch>=2.9.0 as dynamo becomes the default and introduces bugs as it does not support opset_version=17 natively + dynamo=False) def build_trt_engine(model_type, From 27f582b7d541f15c0f39cbc5a8055d0f4cc9d23d Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:17:01 +0800 Subject: [PATCH 08/26] =?UTF-8?q?[https://nvbugs/5634220][fix]=20Add=20dev?= =?UTF-8?q?eloper=20guide=20back=20and=20fix=20some=20i=E2=80=A6=20(#8911)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- docs/source/blogs/H100vsA100.md | 2 +- docs/source/blogs/H200launch.md | 2 +- .../tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md | 2 +- docs/source/features/disagg-serving.md | 2 +- docs/source/index.rst | 1 + examples/llm-api/extra-llm-api-config.yml | 5 +++++ examples/models/core/multimodal/README.md | 2 +- examples/sample_weight_stripping/README.md | 2 +- 8 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 examples/llm-api/extra-llm-api-config.yml diff --git a/docs/source/blogs/H100vsA100.md b/docs/source/blogs/H100vsA100.md index 06edd816202..9359863b548 100644 --- a/docs/source/blogs/H100vsA100.md +++ b/docs/source/blogs/H100vsA100.md @@ -28,7 +28,7 @@ TensorRT LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT LLM v0.5.0., TensorRT 9.1 -The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html) +The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/0.21.0/performance/perf-overview.html) Stay tuned for a highlight on Llama coming soon! 
diff --git a/docs/source/blogs/H200launch.md b/docs/source/blogs/H200launch.md index 6fd0737c33d..39463990368 100644 --- a/docs/source/blogs/H200launch.md +++ b/docs/source/blogs/H200launch.md @@ -21,7 +21,7 @@ TensorRT LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news *(1) Largest batch supported on given TP configuration by power of 2.* *(2) TP = Tensor Parallelism* -Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html). +Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/0.21.0/performance/perf-overview.html). ### H200 vs H100 diff --git a/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md b/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md index f0d7647d001..fef8dcc93a2 100644 --- a/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md +++ b/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md @@ -124,7 +124,7 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments. -For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html). +For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/backends/trtllm/README.html). ### Triton Inference Server diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index cbeea3cc503..ce52b9a3d5e 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -94,7 +94,7 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments. -For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html). +For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/backends/trtllm/README.html). ### trtllm-serve diff --git a/docs/source/index.rst b/docs/source/index.rst index 58ef3c76df9..7540b4c96a1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -77,6 +77,7 @@ Welcome to TensorRT LLM's Documentation! features/ray-orchestrator.md features/torch_compile_and_piecewise_cuda_graph.md + .. 
toctree:: :maxdepth: 2 :caption: Developer Guide diff --git a/examples/llm-api/extra-llm-api-config.yml b/examples/llm-api/extra-llm-api-config.yml new file mode 100644 index 00000000000..120cfea82e4 --- /dev/null +++ b/examples/llm-api/extra-llm-api-config.yml @@ -0,0 +1,5 @@ +cuda_graph_config: + enable_padding: True + max_batch_size: 16 +moe_config: + backend: trtllm diff --git a/examples/models/core/multimodal/README.md b/examples/models/core/multimodal/README.md index d001424bfc8..d92ec168bb8 100644 --- a/examples/models/core/multimodal/README.md +++ b/examples/models/core/multimodal/README.md @@ -901,7 +901,7 @@ Note that for instruct Vision model, please set the `max_encoder_input_len` as ` ## NeVA -[NeVA](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/multimodal/mllm/neva.html) is a groundbreaking addition to the NeMo Multimodal ecosystem. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. +[NeVA](https://docs.nvidia.com/nemo-framework/user-guide/latest/vlms/neva.html) is a groundbreaking addition to the NeMo Multimodal ecosystem. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. 1. Generate TRT-LLM engine for NVGPT following example in `examples/models/core/gpt/README.md`. To adhere to the NVGPT conventions of the conversion script, some layer keys have to be remapped using `--nemo_rename_key`. diff --git a/examples/sample_weight_stripping/README.md b/examples/sample_weight_stripping/README.md index a427dd3df45..dcc7d754f79 100644 --- a/examples/sample_weight_stripping/README.md +++ b/examples/sample_weight_stripping/README.md @@ -241,7 +241,7 @@ python3 ../summarize.py --engine_dir engines/llama2-70b-hf-fp8-tp2.refit \ ## Prototype ### Checkpoint Pruner -The checkpoint pruner allows you to strip `Conv` and `Gemm` weights out of a TensorRT LLM [checkpoint](https://nvidia.github.io/TensorRT-LLM/latest/architecture/checkpoint.html). Since these make up the vast majority of weights, the pruner will decrease the size of your checkpoint up to 99%. +The checkpoint pruner allows you to strip `Conv` and `Gemm` weights out of a TensorRT LLM [checkpoint](https://nvidia.github.io/TensorRT-LLM/0.21.0/architecture/checkpoint.html). Since these make up the vast majority of weights, the pruner will decrease the size of your checkpoint up to 99%. When building an engine with a pruned checkpoint, TensorRT LLM fills in the missing weights with random ones. These weights should later be [refit](#engine-refitter) with the original weights to preserve the intended behavior. 
From 852ca27c1351344686ac1f635c1b9dacdda45590 Mon Sep 17 00:00:00 2001 From: Jin Li <59594262+liji-nv@users.noreply.github.com> Date: Thu, 6 Nov 2025 11:40:50 +0800 Subject: [PATCH 09/26] [https://nvbugs/5467531][fix] Fix moe test and wide ep fake impl (#8883) Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../modules/fused_moe/fused_moe_wide_ep.py | 27 ++++++++ .../unittest/_torch/modules/test_fused_moe.py | 68 +++++++++---------- 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py index b46e96ddf7d..b06bc6bd125 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py @@ -960,3 +960,30 @@ def load_weights(self, def post_load_weights(self): self.quant_method.post_load_weights(self) + + def forward_fake( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + *, + do_finalize: bool = True, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + **kwargs, + ) -> Union[torch.Tensor, List[torch.Tensor]]: + moe_output = super().forward_fake( + x, + router_logits, + do_finalize=do_finalize, + output_dtype=torch.bfloat16, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=use_dp_padding, + **kwargs) + if self.alltoall_method_type == AlltoallMethodType.MNNVL: + shape = moe_output.shape + top_k = self.routing_method.experts_per_token + new_shape = [shape[0], top_k, shape[1]] + return moe_output.new_empty(new_shape) + else: + return moe_output diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 65fb89cf608..291daf110f1 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -334,7 +334,7 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): world_size = 4 dtype = torch.bfloat16 - HIDDEN_SIZE = 2560 + HIDDEN_SIZE = 4096 INTERMEDIATE_SIZE = 1536 NUM_EXPERTS = 72 TOP_K = 6 @@ -350,8 +350,8 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): x_list = [] m = MAX_NUM_TOKENS while m >= 1: - x = torch.randn((m, HIDDEN_SIZE), dtype=dtype, device="cuda") - x_list.append(x.cuda(i)) + x = torch.randn((m, HIDDEN_SIZE), dtype=dtype) + x_list.append(x) m //= 2 x_abs_max = torch.cat([x.flatten() for x in x_list]).abs().max().float() @@ -396,40 +396,28 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): w3_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse( w3_sf_block.cpu().view(INTERMEDIATE_SIZE, -1)) - w1_input_scale = x_sf_global.cuda(i) - w2_input_scale = x_sf_global.cuda(i) - w3_input_scale = x_sf_global.cuda(i) + weights[f"{expert_id}.w1.weight"] = w1_weight_nvfp4.cpu() + weights[f"{expert_id}.w2.weight"] = w2_weight_nvfp4.cpu() + weights[f"{expert_id}.w3.weight"] = w3_weight_nvfp4.cpu() + weights[f"{expert_id}.w1.weight_scale"] = w1_sf_block_unswizzled + weights[f"{expert_id}.w2.weight_scale"] = w2_sf_block_unswizzled + weights[f"{expert_id}.w3.weight_scale"] = w3_sf_block_unswizzled - weights[f"{expert_id}.w1.weight"] = w1_weight_nvfp4.cuda(i) - weights[f"{expert_id}.w2.weight"] = w2_weight_nvfp4.cuda(i) - weights[f"{expert_id}.w3.weight"] = w3_weight_nvfp4.cuda(i) - weights[ - 
f"{expert_id}.w1.weight_scale"] = w1_sf_block_unswizzled.cuda(i) - weights[ - f"{expert_id}.w2.weight_scale"] = w2_sf_block_unswizzled.cuda(i) - weights[ - f"{expert_id}.w3.weight_scale"] = w3_sf_block_unswizzled.cuda(i) - - weights[f"{expert_id}.w1.input_scale"] = 1.0 / w1_input_scale.cuda( - i) - weights[f"{expert_id}.w2.input_scale"] = 1.0 / w2_input_scale.cuda( - i) - weights[f"{expert_id}.w3.input_scale"] = 1.0 / w3_input_scale.cuda( - i) - weights[f"{expert_id}.w1.weight_scale_2"] = 1.0 / w3_w1_global.cuda( - i) - weights[f"{expert_id}.w2.weight_scale_2"] = 1.0 / w2_sf_global.cuda( - i) - weights[f"{expert_id}.w3.weight_scale_2"] = 1.0 / w3_w1_global.cuda( - i) + weights[f"{expert_id}.w1.input_scale"] = 1.0 / x_sf_global + weights[f"{expert_id}.w2.input_scale"] = 1.0 / x_sf_global + weights[f"{expert_id}.w3.input_scale"] = 1.0 / x_sf_global + weights[f"{expert_id}.w1.weight_scale_2"] = 1.0 / w3_w1_global.cpu() + weights[f"{expert_id}.w2.weight_scale_2"] = 1.0 / w2_sf_global.cpu() + weights[f"{expert_id}.w3.weight_scale_2"] = 1.0 / w3_w1_global.cpu() x_list_world.append(x_list) weights_world.append(weights) + torch.cuda.synchronize() - def per_rank_test_fused_moe_alltoall(job_id): + def per_rank_test_fused_moe_alltoall(job_id, weights, x_list): routing_method = DefaultMoeRoutingMethod(top_k=TOP_K) mapping = Mapping(world_size=world_size, - rank=mpi_rank(), + rank=job_id, tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1, @@ -437,8 +425,8 @@ def per_rank_test_fused_moe_alltoall(job_id): torch.cuda.set_device(mapping.rank) torch.manual_seed(mapping.rank) - x_list = x_list_world[mapping.rank] - weights = weights_world[mapping.rank] + weights = {k: v.cuda() for k, v in weights.items()} + x_list = [x.cuda() for x in x_list] quant_config = QuantConfig(quant_algo=QuantAlgo.NVFP4) with mock.patch.object(WideEPMoE, @@ -489,6 +477,16 @@ def per_rank_test_fused_moe_alltoall(job_id): router_logits, all_rank_num_tokens=all_rank_num_tokens, use_dp_padding=False) + # Verify the fake impl is correct. 
+ output_fake = alltoall_model.forward_fake( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=False) + assert output_fake.shape == output.shape + assert output_fake.dtype == output.dtype + if len(output.shape) == 3: + output = torch.sum(output, dim=1, keepdim=False) ref_output = ref_model.forward( x, router_logits, @@ -500,8 +498,10 @@ def per_rank_test_fused_moe_alltoall(job_id): m //= 2 with MPIPoolExecutor(max_workers=world_size) as executor: - results = executor.map(per_rank_test_fused_moe_alltoall, - range(world_size)) + results = executor.map( + per_rank_test_fused_moe_alltoall, + *zip(*[(i, weights_world[i], x_list_world[i]) + for i in range(world_size)])) for r in results: assert r is None From 0a32f26ff8ae9b3ca15d02afebc722f4339e8319 Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Fri, 7 Nov 2025 15:13:29 +0800 Subject: [PATCH 10/26] [https://nvbugs/5636946][fix] Update test model (#8993) Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../defs/accuracy/test_llm_api_pytorch.py | 133 ++++++++++++++++++ .../test_lists/qa/llm_function_rtx6k.txt | 2 +- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index e4895ee9e0f..4d3b39a7b2d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2186,6 +2186,139 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, # task.evaluate(llm, # extra_evaluator_kwargs=dict(apply_chat_template=True)) + @skip_pre_blackwell + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", + [ + # Use a larger batch_size to speed up the tests + pytest.param(8, + 1, + 4, + 3, + False, + False, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 4, + 3, + False, + False, + True, + True, + 32, + "TRTLLM", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 8, + 0, + True, + True, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 1, + 0, + True, + True, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(4, + 1, + 1, + 0, + True, + True, + True, + True, + 16, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(4)), + pytest.param(8, + 1, + 8, + 1, + True, + True, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 8, + 1, + True, + True, + True, + True, + 8, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + ], + ids=[ + "latency", "latency_trtllmgen", "throughput", "throughput_tp8", + "throughput_tp4", "throughput_mtp", "throughput_bs8_mtp" + ]) + def test_nvfp4_multi_gpus_sm120(self, tp_size, pp_size, ep_size, mtp_nextn, + fp8kv, attention_dp, cuda_graph, + overlap_scheduler, max_batch_size, + moe_backend): + if moe_backend == "TRTLLM" and (get_sm_version() == 120 + or get_sm_version() == 121): + pytest.skip( + "MOE TRTLLM backend does not support SM version 120 or 121") + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) + pytorch_config = dict( + disable_overlap_scheduler=not 
overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + if fp8kv: + kv_cache_config.dtype = "fp8" + + mtp_config = None + if mtp_nextn > 0: + mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) + with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2", + max_batch_size=max_batch_size, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + + assert llm.args.moe_config.backend == moe_backend + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + # This covers the case with relatively large seqlen in the generation phase. + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + # Commented out because GPQA takes too long to run + # task = GPQADiamond(self.MODEL_NAME) + # task.evaluate(llm, + # extra_evaluator_kwargs=dict(apply_chat_template=True)) + @skip_pre_blackwell @pytest.mark.parametrize( "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", diff --git a/tests/integration/test_lists/qa/llm_function_rtx6k.txt b/tests/integration/test_lists/qa/llm_function_rtx6k.txt index ed257078678..88b1e1ebd14 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6k.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6k.txt @@ -30,7 +30,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtl accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8] test_e2e.py::test_ptp_quickstart_advanced_mixed_precision test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] From 9fe24d60b05885ab5c2d0e198fdcd1c90bae5722 Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Sat, 8 Nov 2025 00:27:12 +0800 Subject: [PATCH 11/26] [None][doc] Replace the relative links with absolute links in README.md. (#8997) Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4bcab9d94b6..4c6745dc5b2 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co TensorRT LLM is an open-sourced library for optimizing Large Language Model (LLM) inference. 
It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, [FP4](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/), INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ...), speculative decoding, and much more, to perform inference efficiently on NVIDIA GPUs. -[Architected on PyTorch](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/torch/arch_overview.md), TensorRT LLM provides a high-level Python [LLM API](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server). +[Architected on PyTorch](https://github.com/NVIDIA/TensorRT-LLM/blob/release/1.1/docs/source/developer-guide/overview.md), TensorRT LLM provides a high-level Python [LLM API](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server). TensorRT LLM is designed to be modular and easy to modify. Its PyTorch-native architecture allows developers to experiment with the runtime or extend functionality. Several popular models are also pre-defined and can be customized using [native PyTorch code](./tensorrt_llm/_torch/models/modeling_deepseekv3.py), making it easy to adapt the system to specific needs. From 88a398ce23d9b3b32381b55cc0a538d97a1c9d7c Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Sat, 8 Nov 2025 02:14:00 +0800 Subject: [PATCH 12/26] [https://nvbugs/5575920][fix] Fix cublas/cublasLt handle creation memory not sufficient error (#8900) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- cpp/tensorrt_llm/common/opUtils.cpp | 78 +++++++++++++++++-- .../defs/accuracy/test_llm_api_pytorch.py | 12 ++- 2 files changed, 82 insertions(+), 8 deletions(-) diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index ae3810a255f..736cd1c48d0 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -252,6 +252,46 @@ class PerCudaCtxPerThreadSingletonCreator std::unordered_map, hash>* mObservers; }; +// Structure to hold memory information +struct MemoryInfo +{ + size_t free_mb; + size_t total_mb; + float free_percent; +}; + +// Helper function to get current memory information +MemoryInfo getMemoryInfo() +{ + size_t free_mem = 0, total_mem = 0; + TLLM_CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); + + size_t const free_mb = free_mem / (1024 * 1024); + size_t const total_mb = total_mem / (1024 * 1024); + float const free_percent = (total_mem > 0) ? 
(static_cast(free_mem) / total_mem * 100.0f) : 0.0f; + + return {free_mb, total_mb, free_percent}; +} + +// Helper function to log current memory usage +void logMemoryUsage(char const* operation, CUcontext ctx) +{ + auto const mem = getMemoryInfo(); + TLLM_LOG_DEBUG("%s: Context=%p, Free Memory=%zu MB (%.1f%%), Total=%zu MB", operation, ctx, mem.free_mb, + mem.free_percent, mem.total_mb); +} + +// Helper function to throw +void throwCublasErrorWithMemInfo(char const* operation, CUcontext ctx, cublasStatus_t status) +{ + auto const mem = getMemoryInfo(); + TLLM_THROW( + "Failed to create %s. " + "Status: %d, Context: %p, Free Memory: %zu MB (%.1f%%), Total: %zu MB. " + "Consider reducing kv_cache_config.free_gpu_memory_fraction.", + operation, status, ctx, mem.free_mb, mem.free_percent, mem.total_mb); +} + } // namespace std::shared_ptr getCublasHandle() @@ -259,13 +299,26 @@ std::shared_ptr getCublasHandle() static PerCudaCtxPerThreadSingletonCreator creator( []() -> auto { - auto handle = std::unique_ptr(new cublasHandle_t); - TLLM_CUDA_CHECK(cublasCreate(handle.get())); + CUcontext ctx = getCurrentCudaCtx(); + logMemoryUsage("Creating cublas handle", ctx); + + auto handle = std::make_unique(); + auto status = cublasCreate(handle.get()); + + if (status != CUBLAS_STATUS_SUCCESS) + { + throwCublasErrorWithMemInfo("cublas handle", ctx, status); + } + return handle; }, [](cublasHandle_t* handle) { - TLLM_CUDA_CHECK(cublasDestroy(*handle)); + auto status = cublasDestroy(*handle); + if (status != CUBLAS_STATUS_SUCCESS) + { + TLLM_LOG_WARNING("Failed to destroy cublas handle. Status: %d", status); + } delete handle; handle = nullptr; }); @@ -277,13 +330,26 @@ std::shared_ptr getCublasLtHandle() static PerCudaCtxPerThreadSingletonCreator creator( []() -> auto { - auto handle = std::unique_ptr(new cublasLtHandle_t); - TLLM_CUDA_CHECK(cublasLtCreate(handle.get())); + CUcontext ctx = getCurrentCudaCtx(); + logMemoryUsage("Creating cublasLt handle", ctx); + + auto handle = std::make_unique(); + auto status = cublasLtCreate(handle.get()); + + if (status != CUBLAS_STATUS_SUCCESS) + { + throwCublasErrorWithMemInfo("cublasLt handle", ctx, status); + } + return handle; }, [](cublasLtHandle_t* handle) { - TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); + auto status = cublasLtDestroy(*handle); + if (status != CUBLAS_STATUS_SUCCESS) + { + TLLM_LOG_WARNING("Failed to destroy cublasLt handle. 
Status: %d", status); + } delete handle; handle = nullptr; }); diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 4d3b39a7b2d..2bd4707a479 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -164,7 +164,11 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile): disable_overlap_scheduler=torch_compile, ) if fp8kv: - pytorch_config["kv_cache_config"] = KvCacheConfig(dtype="fp8") + pytorch_config["kv_cache_config"] = KvCacheConfig( + dtype="fp8", + free_gpu_memory_fraction= + 0.8, # Prevent cublas/cublasLt handle allocation memory insufficient errors + ) with LLM( f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", **pytorch_config) as llm: @@ -199,7 +203,11 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend, disable_overlap_scheduler=torch_compile, ) if fp8kv: - pytorch_config["kv_cache_config"] = KvCacheConfig(dtype="fp8") + pytorch_config["kv_cache_config"] = KvCacheConfig( + dtype="fp8", + free_gpu_memory_fraction= + 0.8, # Prevent cublas/cublasLt handle allocation memory insufficient errors + ) with LLM( f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", tensor_parallel_size=tp_size, From aa11be1185d5b6d432484e390ab5ee79533d6cdb Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Mon, 10 Nov 2025 13:44:16 +0800 Subject: [PATCH 13/26] =?UTF-8?q?[TRTLLM-9073][doc]=20Add=20the=20missing?= =?UTF-8?q?=20content=20for=20model=20support=20section=20and=20fix?= =?UTF-8?q?=E2=80=A6=20(#9033)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- docs/source/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/overview.md b/docs/source/overview.md index fe44002b167..c132756d551 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -23,7 +23,7 @@ TensorRT LLM delivers breakthrough performance on the latest NVIDIA GPUs: ### 🎯 **Comprehensive Model Support** -TensorRT LLM supports the latest and most popular LLM architectures: +TensorRT LLM supports the latest and most popular LLM [architectures](https://nvidia.github.io/TensorRT-LLM/models/supported-models.html). - **Language Models**: GPT-OSS, Deepseek-R1/V3, Llama 3/4, Qwen2/3, Gemma 3, Phi 4... - **Multi-modal Models**: LLaVA-NeXT, Qwen2-VL, VILA, Llama 3.2 Vision... 
From b10e908a1cc75ab0e0c825ad3af25d24c2d0fa04 Mon Sep 17 00:00:00 2001 From: Vincent Zhang Date: Tue, 11 Nov 2025 13:00:47 +0800 Subject: [PATCH 14/26] [https://nvbugs/5284463][fix] fix ada fp8 group gemm lacks shared memory (#9044) Signed-off-by: Vincent Zhang Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../kernels/cutlass_kernels/cutlass_heuristic.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 7c81a5d7b56..1283d8936e4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -177,13 +177,13 @@ std::vector get_candidate_tiles( { if (sm == 89 || sm >= 120) { - return {CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128, - CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, + return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64, CutlassTileConfig::CtaShape64x64x128_WarpShape32x64x64, CutlassTileConfig::CtaShape128x64x64_WarpShape64x32x64, CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64, - CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64}; + CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64, + CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128}; } else { From eeb69fc1cb284aa25022f49acd43cd15188cfe18 Mon Sep 17 00:00:00 2001 From: peaceh-nv <103117813+peaceh-nv@users.noreply.github.com> Date: Tue, 11 Nov 2025 15:42:08 +0800 Subject: [PATCH 15/26] [https://nvbugs/5570575][fix] : Use less kv cache memory on SM120 (#9054) Signed-off-by: peaceh <103117813+peaceh-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/defs/test_e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 16bea901095..679705bb283 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2466,6 +2466,7 @@ def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name, f"{llm_models_root()}/{model_path}", "--tp_size=2", "--max_num_tokens=256", + f"--kv_cache_fraction={_MEM_FRACTION_50}", ]) From 62c2d989107f6ceed61424f22e56c450454d85c7 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Wed, 12 Nov 2025 16:08:05 +0800 Subject: [PATCH 16/26] [https://nvbugs/5628952][fix] avoid cudaFree overlap with cuda graph (#8903) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1856f8dbd5d..e201ad66729 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -351,7 +351,6 @@ unittest/bindings/test_hostfunc.py::test_hostfunc SKIP (https://nvbugs/5643631) examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) 
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824) unittest/_torch/modules SKIP (https://nvbugs/5637037) accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854) accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854) From e1bcf6b3a6207e2919bc3126aef29a7795248cff Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Thu, 13 Nov 2025 05:17:20 -0800 Subject: [PATCH 17/26] [https://nvbugs/5628204][fix] Stop token IDs - fast path optimization for single stop token IDs only (#9014) Signed-off-by: Michal Guzek Signed-off-by: Michal Guzek Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tensorrt_llm/_torch/pyexecutor/sampler.py | 16 +++- .../integration/test_lists/test-db/l0_a10.yml | 1 + .../_torch/sampler/test_trtllm_sampler.py | 84 ++++++++++++++++++- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 99ad3453b7e..c2c4032d785 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -698,12 +698,24 @@ def _meet_max_token_stop_criteria(request: LlmRequest, max_seq_len: int): ) @staticmethod - def _meet_stop_token_criteria(request: LlmRequest): + def _meet_stop_token_criteria(request: LlmRequest, new_token: int): if request.py_stop_words_list: assert isinstance(request.py_stop_words_list, list), ( "request.py_stop_words_list should be a list" ) stop_words_list, prefix_sum = request.py_stop_words_list + + # Determine max stop word length to decide optimization path + max_stop_word_length = prefix_sum[0] if prefix_sum else 0 + for i in range(1, len(prefix_sum)): + word_length = prefix_sum[i] - prefix_sum[i - 1] + max_stop_word_length = max(max_stop_word_length, word_length) + + # Fast path: all stop words are single tokens + if max_stop_word_length == 1: + return new_token in stop_words_list + + # Slow path: at least one multi-token stop word exists tokens = request.get_tokens(0) offset = 0 for i, offset_end in enumerate(prefix_sum): @@ -730,7 +742,7 @@ def _handle_stop_criteria( request.finish_by(FinishReason.LENGTH, BEAM) return True - if cls._meet_stop_token_criteria(request): + if cls._meet_stop_token_criteria(request, new_token): request.finish_by(FinishReason.STOP_WORDS, BEAM) return True diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 3bca2c6eced..1b79caaec78 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -19,6 +19,7 @@ l0_a10: - unittest/utils/test_util.py - unittest/_torch/modeling/test_modeling_mistral.py - unittest/_torch/modeling/test_modeling_pixtral.py + - unittest/_torch/sampler/test_trtllm_sampler.py # NOTE: this is a CPU-only test, but we do not have a dedicated job for this (and therefore no # test list either). 
- unittest/_torch/models/checkpoints/hf/test_weight_loader.py diff --git a/tests/unittest/_torch/sampler/test_trtllm_sampler.py b/tests/unittest/_torch/sampler/test_trtllm_sampler.py index dec50239c13..355ab4cce73 100644 --- a/tests/unittest/_torch/sampler/test_trtllm_sampler.py +++ b/tests/unittest/_torch/sampler/test_trtllm_sampler.py @@ -12,8 +12,10 @@ def model_path(): return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" -def create_llm(model_dir): - """Create LLM with specific overlap scheduler setting""" +def _create_llm_base(model_dir, enable_trtllm_sampler): + """Base LLM creation with configurable sampler.""" + sampler_type = "TRTLLMSampler" if enable_trtllm_sampler else "TorchSampler" + trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) return LLM( @@ -22,13 +24,23 @@ def create_llm(model_dir): trust_remote_code=True, enable_chunked_prefill=True, cuda_graph_config=CudaGraphConfig(), + sampler_type=sampler_type, kv_cache_config=trt_kv_cache_config, - sampler_type="TRTLLMSampler", max_num_tokens= 128 # Only one request longer than max_num_tokens is required to test chunked prefill ) +def create_llm(model_dir): + """Create LLM with specific overlap scheduler setting""" + return _create_llm_base(model_dir, enable_trtllm_sampler=True) + + +def create_llm_with_torch_sampler(model_dir): + """Create LLM with TorchSampler.""" + return _create_llm_base(model_dir, enable_trtllm_sampler=False) + + @pytest.mark.high_cuda_memory def test_trtllm_sampler(model_path): prompts = [ @@ -68,3 +80,69 @@ def test_trtllm_sampler(model_path): # Verify outputs are consistent for text, expected in zip(texts, expected_outputs): assert similar(text, expected), f"text: {text}, expected: {expected}" + + +@pytest.mark.high_cuda_memory +def test_trtllm_sampler_with_stop_token_ids(model_path): + """Test sampler with stop_token_ids (fast path optimization).""" + + llm = create_llm_with_torch_sampler(model_path) + tokenizer = llm.tokenizer + + prompt = "The capital of France is" + target_sentence = "The capital of France is Paris" + + prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False) + target_tokens = tokenizer.encode(target_sentence, add_special_tokens=False) + + # Use the first token after the prompt as the stop token + assert len(target_tokens) > len( + prompt_tokens), "Target must be longer than prompt" + stop_token_id = target_tokens[len(prompt_tokens)] + + sampling_config = SamplingParams(max_tokens=100, + n=1, + stop_token_ids=[stop_token_id], + temperature=0.0) + + outputs = llm.generate([prompt], sampling_params=sampling_config) + text = outputs[0].outputs[0].text + + output_tokens = tokenizer.encode(text, add_special_tokens=False) + + llm.shutdown() + assert stop_token_id not in output_tokens, f"Output should not contain stop token {stop_token_id}" + assert len(output_tokens + ) < 10, "Should stop very early with first-token stop_token_id" + + +@pytest.mark.high_cuda_memory +def test_torch_sampler_with_multi_token_stop_words(model_path): + """Test TorchSampler with multi-token stop words (slow path).""" + + llm = create_llm_with_torch_sampler(model_path) + tokenizer = llm.tokenizer + + prompt = "The capital of France is" + + # Use a string that will tokenize to multiple tokens + stop_string = "\n\n" + stop_tokens = tokenizer.encode(stop_string, add_special_tokens=False) + + assert len( + stop_tokens + ) > 1, f"Stop string should be multi-token, got {len(stop_tokens)} tokens" + + sampling_config = SamplingParams( + max_tokens=100, + n=1, + 
stop=[stop_string], # Use 'stop' parameter for multi-token + temperature=0.0) + + outputs = llm.generate([prompt], sampling_params=sampling_config) + text = outputs[0].outputs[0].text + + llm.shutdown() + + assert len(text) > 0, "Should generate some text" + assert stop_string not in text, f"Stop string '{repr(stop_string)}' should not appear in the output" From 02da938cf16c506a62c2eaa87c8621dbb7867814 Mon Sep 17 00:00:00 2001 From: Chang Liu <9713593+chang-l@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:58:14 -0800 Subject: [PATCH 18/26] [TRTLLM-7971][doc] Doc update for multimodal in v1.1 (#9015) Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../legacy/reference/multimodal-feature-support-matrix.md | 4 ++-- docs/source/models/supported-models.md | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/legacy/reference/multimodal-feature-support-matrix.md b/docs/source/legacy/reference/multimodal-feature-support-matrix.md index d0cf2372684..b6d99e24ca6 100644 --- a/docs/source/legacy/reference/multimodal-feature-support-matrix.md +++ b/docs/source/legacy/reference/multimodal-feature-support-matrix.md @@ -7,7 +7,7 @@ | VILA | Yes | No | No | No | | LLaVA-NeXT | Yes | Yes | Yes | Yes | | Llama 4 | Yes | Yes | No | No | -| Mistral-Small-3.1 | Yes | Yes | No | No | -| Phi-4-multimodal | Yes | Yes | No | No | +| Mistral-Small-3.1 | Yes | Yes | Yes | Yes | +| Phi-4-multimodal | Yes | Yes | Yes | Yes | | Qwen2-VL | Yes | Yes | Yes | Yes | | Qwen2.5-VL | Yes | Yes | Yes | Yes | diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index 749cfcc21d9..c6b6194b5d0 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -50,13 +50,13 @@ Note: Support for other models may vary. 
Features marked "N/A" are not applicabl | `Gemma3ForConditionalGeneration` | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No | L + I | | `HCXVisionForCausalLM` | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I | | `LlavaLlamaModel (VILA)` | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I + V | -| `LlavaNextForConditionalGeneration` | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I | +| `LlavaNextForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I | | `Llama4ForConditionalGeneration` | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I | | `Mistral3ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I | -| `NemotronH_Nano_VL_V2` | Yes | Yes | Yes | Yes | Yes | No | Yes | No | L + I + V | +| `NemotronH_Nano_VL_V2` | Yes | Yes | Yes | Yes | Yes | N/A | Yes | No | L + I + V | | `Phi4MMForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I + A | -| `Qwen2VLForConditionalGeneration` | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + V | -| `Qwen2_5_VLForConditionalGeneration` | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + V | +| `Qwen2VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I + V | +| `Qwen2_5_VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I + V | Note: - L: Language From 2cfd34f5286254d1f27b13fb1859b74f84543ea9 Mon Sep 17 00:00:00 2001 From: Leslie Fang Date: Fri, 14 Nov 2025 12:02:41 +0800 Subject: [PATCH 19/26] [https://nvbugs/5652552][fix] Log the llm args (#9119) Signed-off-by: leslie-fang25 Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tensorrt_llm/_torch/pyexecutor/py_executor_creator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 923599562ce..75c63b829c5 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -732,5 +732,7 @@ def drafting_loop_wrapper(model): if mapping.rank == 0: logger.info(f"LLM Args:\n{llm_args}") + logger.info(f"{llm_args}") + py_executor.start_worker() return py_executor From 225cbfcb742ec7077ff44fc1ac414e482f63fa58 Mon Sep 17 00:00:00 2001 From: brb-nv <169953907+brb-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 02:18:24 -0800 Subject: [PATCH 20/26] [https://nvbugs/5568836][fix] Skip keyword matching for Gemma3 e2e test (#9158) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/defs/test_e2e.py | 43 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 679705bb283..c1fe7c90895 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2561,13 +2561,6 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, [["invention", "person", "scientists", "Lick", "engineers"], ["landscape", "trees", "road", "depicts", "scenic"]] }, - "gemma-3-27b-it": { - "image": [ - ["natural", "turbulent", "dramatic", "scene", "wave"], - ["image", "famous", "rock", "granite", "landmark"], - ["traffic", "moderate", "heavy", "flowing", "cars"], - ], - }, } cmd = [ @@ -2597,6 +2590,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, 
model_name, model_path, output = llm_venv.run_cmd(cmd, caller=check_output) + # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. + if model_name == "gemma-3-27b-it": + print( + f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." + ) + print("output:", output) + return + match_ratio = 4.0 / 5 parsed_outputs = parse_output(output) for prompt_output, prompt_keywords in zip( @@ -2891,12 +2892,6 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, # Define expected keywords for each model expected_keywords = { - "gemma-3-27b-it": { - "image": [ - ["half", "dome", "yosemite", "landmark", "rounded"], - ["flowing", "traffic", "vehicles", "road", "Changi"], - ], - }, "mistral-small-3.1-24b-instruct": { "image": [ ["scenic", "rock", "landscape", "monolith", "formation"], @@ -2939,6 +2934,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, output = llm_venv.run_cmd(cmd, caller=check_output) + # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. + if model_name == "gemma-3-27b-it": + print( + f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." + ) + print("output:", output) + return + # Set match ratio based on model match_ratio = 4.0 / 5 @@ -2984,12 +2987,6 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, # Define expected keywords for each model expected_keywords = { - "gemma-3-27b-it": { - "image": [ - ["description", "image", "half", "dome", "park"], - ["atmosphere", "peaceful", "majestic", "scene", "sky"], - ], - }, "mistral-small-3.1-24b-instruct": { "image": [ [ @@ -3031,6 +3028,14 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, output = llm_venv.run_cmd(cmd, caller=check_output) print("output:", output) + + # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. + if model_name == "gemma-3-27b-it": + print( + f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." + ) + return + # Set match ratio based on model match_ratio = 4.0 / 5 if model_name.startswith("Phi-4-multimodal-instruct"): From 76f898a9454bad8b3a8c75e1621588f8c6812df2 Mon Sep 17 00:00:00 2001 From: Shunkangz <182541032+Shunkangz@users.noreply.github.com> Date: Mon, 17 Nov 2025 10:44:49 +0800 Subject: [PATCH 21/26] [TRTLLM-9159][doc] Add KV Connector docs (#9043) Signed-off-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> Co-authored-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- examples/llm-api/llm_kv_cache_connector.py | 89 ++++++++++++++++++++-- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_connector.py b/examples/llm-api/llm_kv_cache_connector.py index 1eac9a9cd98..2e87e0c6868 100644 --- a/examples/llm-api/llm_kv_cache_connector.py +++ b/examples/llm-api/llm_kv_cache_connector.py @@ -1,6 +1,84 @@ ### :title KV Cache Connector ### :order 6 ### :section Customization +''' +This script demonstrates the KV cache connector feature in TensorRT-LLM, which enables +custom persistence and reuse of KV cache blocks across different LLM instances. + +**Scenario:** +The script implements a persistent KV cache connector that saves computed KV cache blocks +to disk and loads them back in subsequent runs, eliminating redundant computation for +recurring prompts. 
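+
+At a glance, wiring this connector into an `LLM` instance looks like the condensed sketch
+below; it mirrors the full, working configuration in `main()` further down this file. The
+`connector_worker_class` argument name and the literal module string are assumptions made
+for illustration (the real code derives the module name from `__file__`):
+
+```python
+from tensorrt_llm import LLM
+from tensorrt_llm.llmapi.llm_args import KvCacheConnectorConfig
+
+# Hypothetical condensed sketch; see main() below for the runnable version.
+kv_connector_config = KvCacheConnectorConfig(
+    connector_module="llm_kv_cache_connector",
+    connector_scheduler_class="PersistentKvCacheConnectorLeader",
+    connector_worker_class="PersistentKvCacheConnectorWorker",
+)
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
+          backend="pytorch",
+          kv_connector_config=kv_connector_config)
+```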
+ +**What is a KV Cache Connector?** + +A KV cache connector is a customizable interface that allows you to: +1. **Save KV Cache:** Persist computed KV cache blocks to an external storage + (disk, database, distributed cache, etc.) +2. **Load KV Cache:** Retrieve previously computed cache blocks instead of recomputing them +3. **Share Cache Across Instances:** Reuse cache blocks across different LLM instances + or sessions, unlike regular block reuse which is limited to a single instance + +**How It Works:** + +This example implements a `PersistentKvCacheConnector` with two key components: + +* **PersistentKvCacheConnectorLeader (Scheduler):** + - Hashes token sequences to create unique identifiers for each cache block + - Checks if cached blocks exist on disk for incoming requests + - Schedules load operations for cache hits + - Schedules save operations for newly computed blocks + +* **PersistentKvCacheConnectorWorker:** + - Executes the actual load/save operations between GPU and disk + - Loads cached blocks from disk files into GPU memory + - Saves newly computed blocks from GPU to disk files + +**Demonstration:** + +The script processes the same prompt twice using two separate LLM instances: + +1. **First Run (Instance 1):** + - The LLM computes the KV cache for the input prompt + - The connector saves the computed cache blocks to disk (as .pt files) + - The generation completes and the LLM instance is destroyed + +2. **Second Run (Instance 2):** + - A new LLM instance is created with the same connector configuration + - When processing the same prompt, the connector finds matching cache blocks on disk + - The cache is loaded from disk instead of being recomputed + - **Expected Outcome:** Faster prefill as cache blocks are loaded rather than computed + - Both outputs should be identical, demonstrating deterministic cache reuse + +**Key Benefits:** + +- **Cross-Instance Cache Sharing:** Share computed caches across multiple LLM instances +- **Persistent Storage:** Cache survives beyond the lifetime of a single LLM instance +- **Custom Storage Backends:** Implement any storage mechanism (shown here: disk files) +- **Reduced Computation:** Eliminate redundant KV cache computation for repeated prompts + +**How to Run:** + +```bash +python llm_kv_cache_connector.py +``` + +Example: +```bash +python llm_kv_cache_connector.py meta-llama/Llama-3.1-8B-Instruct +``` + +**Implementation Notes:** + +- This example uses content-based hashing to identify cache blocks +- Cache files are stored in a temporary directory (cleaned up after the demo) +- The implementation is simplified and not optimized for production use +- Does not support chunked prefill in this example +- See `tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py` for the full connector interface + +**NOTE:** This example connector implementation is designed for demonstration purposes +and is NOT suitable for production use without additional optimizations and error handling. +''' import os import sys @@ -17,11 +95,6 @@ from tensorrt_llm.bindings.internal.batch_manager import LlmRequest from tensorrt_llm.llmapi.llm_args import KvCacheConnectorConfig, TorchLlmArgs -# This is a simple example of the use of the KV cache connector. -# It persists KV cache contents into a folder, and can load them back on subsequent runs. -# See tensorrt_llm/_torch/pyexecutor/connector.py for details about the KV cache connector interface. -# NOTE: This example connector implementation is NOT suitable for production use. 
- CONNECTOR_CACHE_FOLDER_KEY = "CONNECTOR_CACHE_FOLDER" @@ -198,6 +271,7 @@ def main(model: str): this_module = __file__[__file__.rfind("/") + 1:__file__.rfind(".py")] + # --- KV Cache Connector Config --- kv_connector_config = KvCacheConnectorConfig( connector_module=this_module, connector_scheduler_class="PersistentKvCacheConnectorLeader", @@ -207,6 +281,7 @@ def main(model: str): connector_cache_dir = TemporaryDirectory() os.environ[CONNECTOR_CACHE_FOLDER_KEY] = connector_cache_dir.name + # Create LLM instance with KV Cache Connector llm = LLM(model=model, backend="pytorch", cuda_graph_config=None, @@ -220,6 +295,7 @@ def main(model: str): sampling_params = SamplingParams(max_tokens=32) + # Generate text with the first LLM instance and save the kv cache blocks by the connector. output = llm.generate([test_text], sampling_params) text0 = output[0].outputs[0].text @@ -228,16 +304,19 @@ def main(model: str): del llm + # Create a new LLM instance with the same connector configuration llm = LLM(model=model, backend="pytorch", cuda_graph_config=None, kv_connector_config=kv_connector_config) + # Generate text with the second LLM instance and it should reuse the kv cache blocks from the connector. output = llm.generate([test_text], sampling_params) text1 = output[0].outputs[0].text print("Second output (using connector cache): ", text1) + # Verify that the two outputs are identical assert text0 == text1 connector_cache_dir.cleanup() From 96ea4ffa75e3c5672957a6e482b00d86d2a7a033 Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:11:44 +0800 Subject: [PATCH 22/26] [https://nvbugs/5649826][fix] Unwaive test test_llm_commandr_plus_4gpus_summary (#9201) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/qa/llm_function_core.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 69c24f0f5ee..b7303bc026f 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -14,8 +14,8 @@ examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-ena examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only] examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only] -examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (120) -examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (120) +examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (180) +examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (180) examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] From a94778a0cb6bc01dec2ca8569285fbebfe5448a3 Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Tue, 18 Nov 2025 19:20:07 +0800 Subject: [PATCH 23/26] [https://nvbugs/5461796][fix] Unwaive and extend time for 
test_llmapi_speculative_decoding_mtp (#9092) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/test-db/l0_sanity_check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_lists/test-db/l0_sanity_check.yml b/tests/integration/test_lists/test-db/l0_sanity_check.yml index 68ec272f03d..894bc21b1e7 100644 --- a/tests/integration/test_lists/test-db/l0_sanity_check.yml +++ b/tests/integration/test_lists/test-db/l0_sanity_check.yml @@ -25,7 +25,7 @@ l0_sanity_check: - llmapi/test_llm_examples.py::test_llmapi_example_multilora - llmapi/test_llm_examples.py::test_llmapi_example_guided_decoding - llmapi/test_llm_examples.py::test_llmapi_example_logits_processor - - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp + - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp TIMEOUT (90) - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_ngram - llmapi/test_llm_examples.py::test_llmapi_sampling From ef4a779a69d25a652cc9d3e2431c9b1e82c6b59b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:36:01 +0800 Subject: [PATCH 24/26] [TRTLLM-9092][doc] Add a pre-quantized example in quick start guide (#9223) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- docs/source/quick-start-guide.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md index 4d70b2eba84..2c172f66c01 100644 --- a/docs/source/quick-start-guide.md +++ b/docs/source/quick-start-guide.md @@ -24,6 +24,13 @@ To start the server, you can run a command like the following example inside a D trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ``` +You may also deploy pre-quantized models to improve performance. +Ensure your GPU supports FP8 quantization before running the following: + +```bash +trtllm-serve "nvidia/Qwen3-8B-FP8" +``` + ```{note} If you are running `trtllm-serve` inside a Docker container, you have two options for sending API requests: 1. Expose a port (e.g., 8000) to allow external access to the server from outside the container. 
From e413c8039f65547aa2095c8ef6c992cb28619815 Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:46:02 +0800 Subject: [PATCH 25/26] [https://nvbugs/5648685][fix] Fix openAI server waiting time to avoid large model weight loading out time (#9254) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/unittest/llmapi/apps/openai_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/apps/openai_server.py b/tests/unittest/llmapi/apps/openai_server.py index 39c9988d9f5..b3fde6b94c5 100644 --- a/tests/unittest/llmapi/apps/openai_server.py +++ b/tests/unittest/llmapi/apps/openai_server.py @@ -16,7 +16,7 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "tensorrt_llm" - MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds + MAX_SERVER_START_WAIT_S = 7200 # wait for server to start for 7200 seconds (~ 2 hours) for LLM models weight loading def __init__(self, model: str, From 6d4a5fc7617b1fd9bd041551947a50f0e857a9f6 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:31:35 +0800 Subject: [PATCH 26/26] =?UTF-8?q?[https://nvbugs/5670793][fix]=20Solve=20t?= =?UTF-8?q?rtllm-serve=20launch=5Fdisaggregated=E2=80=A6=20(#9324)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tensorrt_llm/commands/serve.py | 8 ++++---- tensorrt_llm/llmapi/mpi_session.py | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 372808bc0ca..7c10b7a2377 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -29,7 +29,7 @@ parse_disagg_config_file, parse_metadata_server_config_file) from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict -from tensorrt_llm.llmapi.mpi_session import find_free_port +from tensorrt_llm.llmapi.mpi_session import find_free_ipc_addr from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory from tensorrt_llm.logger import logger, severity_map from tensorrt_llm.serve import OpenAIDisaggServer, OpenAIServer @@ -730,10 +730,10 @@ def _launch_disaggregated_leader(sub_comm, instance_idx: int, config_file: str, # This mimics the behavior of trtllm-llmapi-launch # TODO: Make the port allocation atomic - free_port = find_free_port() + free_ipc_addr = find_free_ipc_addr() os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS] = "1" - os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR. - value] = f"tcp://127.0.0.1:{free_port}" + os.environ[ + LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.value] = free_ipc_addr os.environ[DisaggLauncherEnvs.TLLM_DISAGG_RUN_REMOTE_MPI_SESSION_CLIENT. 
value] = "1" os.environ[DisaggLauncherEnvs.TLLM_DISAGG_INSTANCE_IDX] = str(instance_idx) diff --git a/tensorrt_llm/llmapi/mpi_session.py b/tensorrt_llm/llmapi/mpi_session.py index f0275d7f90a..d32e5a7b7aa 100644 --- a/tensorrt_llm/llmapi/mpi_session.py +++ b/tensorrt_llm/llmapi/mpi_session.py @@ -541,6 +541,13 @@ def find_free_port() -> int: return s.getsockname()[1] +def find_free_ipc_addr() -> str: + import os + import tempfile + import uuid + return f'ipc://{os.path.join(tempfile.gettempdir(), "rpc_" + str(uuid.uuid4()))}' + + def get_mpi_world_size() -> int: # avoid cyclic import from ..executor.utils import get_spawn_proxy_process_env