From 3dcc23b27f0dcdd09d068a6f21b946fc82eed432 Mon Sep 17 00:00:00 2001 From: yunruis <205571022+yunruis@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:46:01 +0800 Subject: [PATCH 01/26] [https://nvbugs/5606268][fix] Fix program exit segmentation fault triggered by the CublasMMWrapper destructor (#8834) Signed-off-by: yunruis <205571022+yunruis@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- cpp/tensorrt_llm/common/opUtils.cpp | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 053f9d9ece7..ae3810a255f 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -179,16 +179,24 @@ class PerCudaCtxPerThreadSingletonCreator PerCudaCtxPerThreadSingletonCreator(CreatorFunc creator, DeleterFunc deleter) : mCreator{std::move(creator)} , mDeleter{std::move(deleter)} + , mObservers{new std::unordered_map, hash>()} { } + ~PerCudaCtxPerThreadSingletonCreator() + { + std::lock_guard lk{mMutex}; + delete mObservers; + mObservers = nullptr; + } + std::shared_ptr operator()() { std::lock_guard lk{mMutex}; CUcontext ctx{getCurrentCudaCtx()}; std::thread::id thread = std::this_thread::get_id(); auto const key = std::make_tuple(ctx, thread); - std::shared_ptr result = mObservers[key].lock(); + std::shared_ptr result = (*mObservers)[key].lock(); if (result == nullptr) { TLLM_LOG_TRACE("creating singleton instance for CUDA context %lu and thread %lu", ctx, thread); @@ -202,6 +210,11 @@ class PerCudaCtxPerThreadSingletonCreator } mDeleter(obj); + if (mObservers == nullptr) + { + return; + } + // Clears observer to avoid growth of mObservers, in case users creates/destroys cuda contexts // frequently. std::shared_ptr observedObjHolder; // Delay destroy to avoid dead lock. @@ -210,17 +223,18 @@ class PerCudaCtxPerThreadSingletonCreator // thread just before we lock mMutex. We can't infer that the observer is stale from the fact that // obj is destroyed, because shared_ptr ref-count checking and observer removing are not in one // atomic operation, and the observer may be changed to observe another instance. - if (mObservers.find(key) == mObservers.end()) + auto it = mObservers->find(key); + if (it == mObservers->end()) { return; } - observedObjHolder = mObservers.at(key).lock(); + observedObjHolder = it->second.lock(); if (observedObjHolder == nullptr) { - mObservers.erase(key); + mObservers->erase(it); } }}; - mObservers.at(key) = result; + (*mObservers)[key] = result; } else { @@ -235,7 +249,7 @@ class PerCudaCtxPerThreadSingletonCreator mutable std::mutex mMutex; // CUDA resources are per-context and per-thread.
using CacheKey = std::tuple; - std::unordered_map, hash> mObservers; + std::unordered_map, hash>* mObservers; }; } // namespace @@ -253,6 +267,7 @@ std::shared_ptr getCublasHandle() { TLLM_CUDA_CHECK(cublasDestroy(*handle)); delete handle; + handle = nullptr; }); return creator(); } @@ -270,6 +285,7 @@ std::shared_ptr getCublasLtHandle() { TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); delete handle; + handle = nullptr; }); return creator(); } From 13be0b7aa3acfa9c15cbb8491279b04e2eeb9cc9 Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:09:58 +0800 Subject: [PATCH 02/26] [https://nvbugs/5608930][fix] Unwaive test 5608930 (#8831) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 248c7e58bf1..da7ccc01cea 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -358,7 +358,6 @@ accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKI accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854) disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_bf16_empty_batch[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5601682) accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_eagle3[eagle3_one_model=False-overlap_scheduler=False] SKIP (https://nvbugs/5655584) -accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_chunked_prefill SKIP (https://nvbugs/5608930) examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5655832) From f8a9ecf1e04373bb639119e6be0ea9dcfe120bae Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:53:24 +0800 Subject: [PATCH 03/26] [https://nvbugs/5461796][fix] Unwaive test test_llmapi_speculative_decoding_mtp (#8832) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index da7ccc01cea..1856f8dbd5d 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -227,7 +227,6 @@ triton_server/test_triton.py::test_gpt_ib_streaming[gpt-ib-streaming] SKIP (http triton_server/test_triton.py::test_gpt_ib_ptuning[gpt-ib-ptuning] SKIP (https://nvbugs/5445624) triton_server/test_triton.py::test_mistral_ib_mm[mistral-ib-mm] SKIP (https://nvbugs/5371343) triton_server/test_triton.py::test_t5_ib[t5-ib] SKIP (https://nvbugs/5456482) -llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp SKIP (https://nvbugs/5461796) accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph SKIP (https://nvbugs/5365525) 
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-mini-128k-instruct] SKIP (https://nvbugs/5465143) examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKIP (https://nvbugs/5465143) From 0db9f3c44943c50d74137034f1c13011d987c2d1 Mon Sep 17 00:00:00 2001 From: brb-nv <169953907+brb-nv@users.noreply.github.com> Date: Mon, 3 Nov 2025 14:49:36 -0800 Subject: [PATCH 04/26] [https://nvbugs/5521253][fix] Enable Gemma3 12B & 27B on SM100 (#8666) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- cpp/kernels/fmha_v2/setup.py | 10 ++++++++++ cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp | 2 +- tests/integration/defs/test_e2e.py | 4 ++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 895b4a4592b..43a175ba803 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -6398,6 +6398,16 @@ def enumerate_kernels(): and kspec.cross_mha == False and kspec.flash_attention == True and kspec.input_layout != InputLayout.SEPARATE_Q_K_V) + # Gemma3 VL support. + or (kspec.sm == 100 + and kspec.dtype in ['fp16', 'bf16', 'fp16_fp32', 'e4m3', 'e4m3_fp32'] + and kspec.head_size == 72 + and kspec.head_size_v == 0 + and kspec.sage_block_sizes is None + and kspec.version == 2 + and kspec.cross_mha == False + and kspec.flash_attention == True + and kspec.input_layout != InputLayout.SEPARATE_Q_K_V) # Deepseek MLA (generation 576/512 paged) or (kspec.sm in [90, 100, 120] and kspec.dtype in ['bf16', 'e4m3_fp32'] diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp index 70327e9ca7a..11b3e1b0ffd 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp @@ -49,7 +49,7 @@ FmhaDispatcher::FmhaDispatcher(MHARunnerFixedParams fixedParams) // TRTLLM-GEN only supports power of 2 head sizes. // The exception will fall back to fmha v2. // Please update fmha_v2/setup.py if you want to add more supported head sizes. 
- , mUseTllmGen(tensorrt_llm::common::isSM100Family() && fixedParams.headSize != 80) + , mUseTllmGen(tensorrt_llm::common::isSM100Family() && fixedParams.headSize != 80 && fixedParams.headSize != 72) { if (mUseTllmGen) { diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 06ba812f921..16bea901095 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2497,6 +2497,10 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv): "gemma/gemma-3-27b-it", marks=(pytest.mark.skip_less_device_memory(80000), skip_post_blackwell)), + pytest.param( + "Nano-v2-VLM", + "Nano-v2-VLM", + marks=pytest.mark.skip(reason="Nano V2 VLM ckpt is not released yet.")), ]) def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, modality, use_cuda_graph): From 06f338b884331883f90fe4fbaebc82be9770c570 Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Tue, 4 Nov 2025 18:12:02 +0800 Subject: [PATCH 05/26] [https://nvbugs/5606266][test] move qwen3 multi-node test to the qa list (#8908) Signed-off-by: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/qa/llm_function_multinode.txt | 1 + tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_lists/qa/llm_function_multinode.txt b/tests/integration/test_lists/qa/llm_function_multinode.txt index f2e3f8d216a..8a3958cf33c 100644 --- a/tests/integration/test_lists/qa/llm_function_multinode.txt +++ b/tests/integration/test_lists/qa/llm_function_multinode.txt @@ -11,3 +11,4 @@ test_e2e.py::test_multi_nodes_eval[Kimi-K2-Instruct-tp16-mmlu] test_e2e.py::test_multi_nodes_eval[nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-tp16-mmlu] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp2pp1-gen_tp2pp1] test_e2e.py::test_openai_disagg_multi_nodes_completion[ctx_tp1pp2-gen_tp1pp2] +accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180) diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml index eb618093379..57c3b6fd810 100644 --- a/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml +++ b/tests/integration/test_lists/test-db/l0_gb200_multi_nodes.yml @@ -39,4 +39,3 @@ l0_gb200_multi_nodes: - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] TIMEOUT (90) - - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] TIMEOUT (180) From 883d95cab6ba0bc901330692e3a6444c6372bed5 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Wed, 5 Nov 2025 01:34:26 +0800 Subject: [PATCH 06/26] [https://nvbugs/5569754][chore] Adjust max batch size to prevent OOM (#8876) Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- examples/llm-api/llm_mgmn_llm_distributed.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git 
a/examples/llm-api/llm_mgmn_llm_distributed.sh b/examples/llm-api/llm_mgmn_llm_distributed.sh index bc6b6e16a62..fa23ae2548c 100644 --- a/examples/llm-api/llm_mgmn_llm_distributed.sh +++ b/examples/llm-api/llm_mgmn_llm_distributed.sh @@ -51,5 +51,6 @@ srun -l \ trtllm-llmapi-launch python3 $script \ --model_dir $LOCAL_MODEL \ --prompt 'Hello, how are you?' \ - --tp_size 2 + --tp_size 2 \ + --max_batch_size 256 " From cb36f3e14fdfd3ae670613377cbddd1a53dd8e6f Mon Sep 17 00:00:00 2001 From: Simeng Liu <109828133+SimengLiu-nv@users.noreply.github.com> Date: Tue, 4 Nov 2025 14:11:48 -0800 Subject: [PATCH 07/26] [https://nvbugs/5606136][fix] Fix torch.onnx.export with pytorch upgrade to fallback to dynamo=False. (#8917) Signed-off-by: Simeng Liu Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../models/contrib/dit/vae_decoder_trt.py | 21 +++++++++++-------- examples/models/core/qwenvl/vit_onnx_trt.py | 3 ++- tensorrt_llm/tools/multimodal_builder.py | 17 ++++++++------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/examples/models/contrib/dit/vae_decoder_trt.py b/examples/models/contrib/dit/vae_decoder_trt.py index 1374dbcbfa6..31803a6690d 100644 --- a/examples/models/contrib/dit/vae_decoder_trt.py +++ b/examples/models/contrib/dit/vae_decoder_trt.py @@ -34,15 +34,18 @@ def export_onnx(self, onnxFile): *self.latent_shape).cuda() self.pytorch_model.cuda().eval() with torch.inference_mode(): - torch.onnx.export(self.pytorch_model, - latent, - onnxFile, - opset_version=17, - input_names=['input'], - output_names=['output'], - dynamic_axes={'input': { - 0: 'batch' - }}) + torch.onnx.export( + self.pytorch_model, + latent, + onnxFile, + opset_version=17, + input_names=['input'], + output_names=['output'], + dynamic_axes={'input': { + 0: 'batch' + }}, + # Required for pytorch>=2.9.0 as dynamo becomes the default and introduces bugs as it does not support opset_version=17 natively + dynamo=False) def generate_trt_engine(self, onnxFile, planFile): print(f"Start exporting TRT model to {planFile}!") diff --git a/examples/models/core/qwenvl/vit_onnx_trt.py b/examples/models/core/qwenvl/vit_onnx_trt.py index b667224a6b8..ba21fc93ef0 100644 --- a/examples/models/core/qwenvl/vit_onnx_trt.py +++ b/examples/models/core/qwenvl/vit_onnx_trt.py @@ -89,7 +89,8 @@ def export_onnx(self, onnx_file_path, pretrained_model_path, image_url): dynamic_axes={"input": { 0: "batch" }}, - ) + # Required for pytorch>=2.9.0 as dynamo becomes the default and introduces bugs as it does not support opset_version=17 natively + dynamo=False) release_gc() # Further release memory print( f"Export to ONNX file successfully! 
The ONNX file stays in {onnx_file_path}" diff --git a/tensorrt_llm/tools/multimodal_builder.py b/tensorrt_llm/tools/multimodal_builder.py index ea643475fd1..3906fbe2747 100644 --- a/tensorrt_llm/tools/multimodal_builder.py +++ b/tensorrt_llm/tools/multimodal_builder.py @@ -163,13 +163,16 @@ def export_onnx(model, logger.log(trt.Logger.INFO, f"Exporting onnx to {onnx_dir}/{onnx_name}") os.makedirs(onnx_dir, exist_ok=True) - torch.onnx.export(model, - input, - f'{onnx_dir}/{onnx_name}', - opset_version=17, - input_names=input_names, - output_names=output_names, - dynamic_axes=dynamic_axes) + torch.onnx.export( + model, + input, + f'{onnx_dir}/{onnx_name}', + opset_version=17, + input_names=input_names, + output_names=output_names, + dynamic_axes=dynamic_axes, + # Required for pytorch>=2.9.0 as dynamo becomes the default and introduces bugs as it does not support opset_version=17 natively + dynamo=False) def build_trt_engine(model_type, From 27f582b7d541f15c0f39cbc5a8055d0f4cc9d23d Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Wed, 5 Nov 2025 10:17:01 +0800 Subject: [PATCH 08/26] =?UTF-8?q?[https://nvbugs/5634220][fix]=20Add=20dev?= =?UTF-8?q?eloper=20guide=20back=20and=20fix=20some=20i=E2=80=A6=20(#8911)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- docs/source/blogs/H100vsA100.md | 2 +- docs/source/blogs/H200launch.md | 2 +- .../tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md | 2 +- docs/source/features/disagg-serving.md | 2 +- docs/source/index.rst | 1 + examples/llm-api/extra-llm-api-config.yml | 5 +++++ examples/models/core/multimodal/README.md | 2 +- examples/sample_weight_stripping/README.md | 2 +- 8 files changed, 12 insertions(+), 6 deletions(-) create mode 100644 examples/llm-api/extra-llm-api-config.yml diff --git a/docs/source/blogs/H100vsA100.md b/docs/source/blogs/H100vsA100.md index 06edd816202..9359863b548 100644 --- a/docs/source/blogs/H100vsA100.md +++ b/docs/source/blogs/H100vsA100.md @@ -28,7 +28,7 @@ TensorRT LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT LLM v0.5.0., TensorRT 9.1 -The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html) +The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/0.21.0/performance/perf-overview.html) Stay tuned for a highlight on Llama coming soon! 
diff --git a/docs/source/blogs/H200launch.md b/docs/source/blogs/H200launch.md index 6fd0737c33d..39463990368 100644 --- a/docs/source/blogs/H200launch.md +++ b/docs/source/blogs/H200launch.md @@ -21,7 +21,7 @@ TensorRT LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news *(1) Largest batch supported on given TP configuration by power of 2.* *(2) TP = Tensor Parallelism* -Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html). +Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/0.21.0/performance/perf-overview.html). ### H200 vs H100 diff --git a/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md b/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md index f0d7647d001..fef8dcc93a2 100644 --- a/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md +++ b/docs/source/blogs/tech_blog/blog5_Disaggregated_Serving_in_TensorRT-LLM.md @@ -124,7 +124,7 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments. -For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html). +For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/backends/trtllm/README.html). ### Triton Inference Server diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index cbeea3cc503..ce52b9a3d5e 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -94,7 +94,7 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments. -For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html). +For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/backends/trtllm/README.html). ### trtllm-serve diff --git a/docs/source/index.rst b/docs/source/index.rst index 58ef3c76df9..7540b4c96a1 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -77,6 +77,7 @@ Welcome to TensorRT LLM's Documentation! features/ray-orchestrator.md features/torch_compile_and_piecewise_cuda_graph.md + .. 
toctree:: :maxdepth: 2 :caption: Developer Guide diff --git a/examples/llm-api/extra-llm-api-config.yml b/examples/llm-api/extra-llm-api-config.yml new file mode 100644 index 00000000000..120cfea82e4 --- /dev/null +++ b/examples/llm-api/extra-llm-api-config.yml @@ -0,0 +1,5 @@ +cuda_graph_config: + enable_padding: True + max_batch_size: 16 +moe_config: + backend: trtllm diff --git a/examples/models/core/multimodal/README.md b/examples/models/core/multimodal/README.md index d001424bfc8..d92ec168bb8 100644 --- a/examples/models/core/multimodal/README.md +++ b/examples/models/core/multimodal/README.md @@ -901,7 +901,7 @@ Note that for instruct Vision model, please set the `max_encoder_input_len` as ` ## NeVA -[NeVA](https://docs.nvidia.com/nemo-framework/user-guide/latest/nemotoolkit/multimodal/mllm/neva.html) is a groundbreaking addition to the NeMo Multimodal ecosystem. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. +[NeVA](https://docs.nvidia.com/nemo-framework/user-guide/latest/vlms/neva.html) is a groundbreaking addition to the NeMo Multimodal ecosystem. This model seamlessly integrates large language-centric models with a vision encoder, that can be deployed in TensorRT-LLM. 1. Generate TRT-LLM engine for NVGPT following example in `examples/models/core/gpt/README.md`. To adhere to the NVGPT conventions of the conversion script, some layer keys have to be remapped using `--nemo_rename_key`. diff --git a/examples/sample_weight_stripping/README.md b/examples/sample_weight_stripping/README.md index a427dd3df45..dcc7d754f79 100644 --- a/examples/sample_weight_stripping/README.md +++ b/examples/sample_weight_stripping/README.md @@ -241,7 +241,7 @@ python3 ../summarize.py --engine_dir engines/llama2-70b-hf-fp8-tp2.refit \ ## Prototype ### Checkpoint Pruner -The checkpoint pruner allows you to strip `Conv` and `Gemm` weights out of a TensorRT LLM [checkpoint](https://nvidia.github.io/TensorRT-LLM/latest/architecture/checkpoint.html). Since these make up the vast majority of weights, the pruner will decrease the size of your checkpoint up to 99%. +The checkpoint pruner allows you to strip `Conv` and `Gemm` weights out of a TensorRT LLM [checkpoint](https://nvidia.github.io/TensorRT-LLM/0.21.0/architecture/checkpoint.html). Since these make up the vast majority of weights, the pruner will decrease the size of your checkpoint up to 99%. When building an engine with a pruned checkpoint, TensorRT LLM fills in the missing weights with random ones. These weights should later be [refit](#engine-refitter) with the original weights to preserve the intended behavior. 
From 852ca27c1351344686ac1f635c1b9dacdda45590 Mon Sep 17 00:00:00 2001 From: Jin Li <59594262+liji-nv@users.noreply.github.com> Date: Thu, 6 Nov 2025 11:40:50 +0800 Subject: [PATCH 09/26] [https://nvbugs/5467531][fix] Fix moe test and wide ep fake impl (#8883) Signed-off-by: Jin Li <59594262+liji-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../modules/fused_moe/fused_moe_wide_ep.py | 27 ++++++++ .../unittest/_torch/modules/test_fused_moe.py | 68 +++++++++---------- 2 files changed, 61 insertions(+), 34 deletions(-) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py index b46e96ddf7d..b06bc6bd125 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py @@ -960,3 +960,30 @@ def load_weights(self, def post_load_weights(self): self.quant_method.post_load_weights(self) + + def forward_fake( + self, + x: Union[torch.Tensor, Fp4QuantizedTensor], + router_logits: torch.Tensor, + *, + do_finalize: bool = True, + output_dtype: Optional[torch.dtype] = None, + all_rank_num_tokens: Optional[List[int]] = None, + use_dp_padding: Optional[bool] = None, + **kwargs, + ) -> Union[torch.Tensor, List[torch.Tensor]]: + moe_output = super().forward_fake( + x, + router_logits, + do_finalize=do_finalize, + output_dtype=torch.bfloat16, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=use_dp_padding, + **kwargs) + if self.alltoall_method_type == AlltoallMethodType.MNNVL: + shape = moe_output.shape + top_k = self.routing_method.experts_per_token + new_shape = [shape[0], top_k, shape[1]] + return moe_output.new_empty(new_shape) + else: + return moe_output diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 65fb89cf608..291daf110f1 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -334,7 +334,7 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): world_size = 4 dtype = torch.bfloat16 - HIDDEN_SIZE = 2560 + HIDDEN_SIZE = 4096 INTERMEDIATE_SIZE = 1536 NUM_EXPERTS = 72 TOP_K = 6 @@ -350,8 +350,8 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): x_list = [] m = MAX_NUM_TOKENS while m >= 1: - x = torch.randn((m, HIDDEN_SIZE), dtype=dtype, device="cuda") - x_list.append(x.cuda(i)) + x = torch.randn((m, HIDDEN_SIZE), dtype=dtype) + x_list.append(x) m //= 2 x_abs_max = torch.cat([x.flatten() for x in x_list]).abs().max().float() @@ -396,40 +396,28 @@ def test_fused_moe_alltoall_fp4(alltoall_method_type): w3_sf_block_unswizzled = torch.ops.trtllm.block_scale_interleave_reverse( w3_sf_block.cpu().view(INTERMEDIATE_SIZE, -1)) - w1_input_scale = x_sf_global.cuda(i) - w2_input_scale = x_sf_global.cuda(i) - w3_input_scale = x_sf_global.cuda(i) + weights[f"{expert_id}.w1.weight"] = w1_weight_nvfp4.cpu() + weights[f"{expert_id}.w2.weight"] = w2_weight_nvfp4.cpu() + weights[f"{expert_id}.w3.weight"] = w3_weight_nvfp4.cpu() + weights[f"{expert_id}.w1.weight_scale"] = w1_sf_block_unswizzled + weights[f"{expert_id}.w2.weight_scale"] = w2_sf_block_unswizzled + weights[f"{expert_id}.w3.weight_scale"] = w3_sf_block_unswizzled - weights[f"{expert_id}.w1.weight"] = w1_weight_nvfp4.cuda(i) - weights[f"{expert_id}.w2.weight"] = w2_weight_nvfp4.cuda(i) - weights[f"{expert_id}.w3.weight"] = w3_weight_nvfp4.cuda(i) - weights[ - 
f"{expert_id}.w1.weight_scale"] = w1_sf_block_unswizzled.cuda(i) - weights[ - f"{expert_id}.w2.weight_scale"] = w2_sf_block_unswizzled.cuda(i) - weights[ - f"{expert_id}.w3.weight_scale"] = w3_sf_block_unswizzled.cuda(i) - - weights[f"{expert_id}.w1.input_scale"] = 1.0 / w1_input_scale.cuda( - i) - weights[f"{expert_id}.w2.input_scale"] = 1.0 / w2_input_scale.cuda( - i) - weights[f"{expert_id}.w3.input_scale"] = 1.0 / w3_input_scale.cuda( - i) - weights[f"{expert_id}.w1.weight_scale_2"] = 1.0 / w3_w1_global.cuda( - i) - weights[f"{expert_id}.w2.weight_scale_2"] = 1.0 / w2_sf_global.cuda( - i) - weights[f"{expert_id}.w3.weight_scale_2"] = 1.0 / w3_w1_global.cuda( - i) + weights[f"{expert_id}.w1.input_scale"] = 1.0 / x_sf_global + weights[f"{expert_id}.w2.input_scale"] = 1.0 / x_sf_global + weights[f"{expert_id}.w3.input_scale"] = 1.0 / x_sf_global + weights[f"{expert_id}.w1.weight_scale_2"] = 1.0 / w3_w1_global.cpu() + weights[f"{expert_id}.w2.weight_scale_2"] = 1.0 / w2_sf_global.cpu() + weights[f"{expert_id}.w3.weight_scale_2"] = 1.0 / w3_w1_global.cpu() x_list_world.append(x_list) weights_world.append(weights) + torch.cuda.synchronize() - def per_rank_test_fused_moe_alltoall(job_id): + def per_rank_test_fused_moe_alltoall(job_id, weights, x_list): routing_method = DefaultMoeRoutingMethod(top_k=TOP_K) mapping = Mapping(world_size=world_size, - rank=mpi_rank(), + rank=job_id, tp_size=world_size, moe_ep_size=world_size, moe_tp_size=1, @@ -437,8 +425,8 @@ def per_rank_test_fused_moe_alltoall(job_id): torch.cuda.set_device(mapping.rank) torch.manual_seed(mapping.rank) - x_list = x_list_world[mapping.rank] - weights = weights_world[mapping.rank] + weights = {k: v.cuda() for k, v in weights.items()} + x_list = [x.cuda() for x in x_list] quant_config = QuantConfig(quant_algo=QuantAlgo.NVFP4) with mock.patch.object(WideEPMoE, @@ -489,6 +477,16 @@ def per_rank_test_fused_moe_alltoall(job_id): router_logits, all_rank_num_tokens=all_rank_num_tokens, use_dp_padding=False) + # Verify the fake impl is correct. 
+ output_fake = alltoall_model.forward_fake( + x, + router_logits, + all_rank_num_tokens=all_rank_num_tokens, + use_dp_padding=False) + assert output_fake.shape == output.shape + assert output_fake.dtype == output.dtype + if len(output.shape) == 3: + output = torch.sum(output, dim=1, keepdim=False) ref_output = ref_model.forward( x, router_logits, @@ -500,8 +498,10 @@ def per_rank_test_fused_moe_alltoall(job_id): m //= 2 with MPIPoolExecutor(max_workers=world_size) as executor: - results = executor.map(per_rank_test_fused_moe_alltoall, - range(world_size)) + results = executor.map( + per_rank_test_fused_moe_alltoall, + *zip(*[(i, weights_world[i], x_list_world[i]) + for i in range(world_size)])) for r in results: assert r is None From 0a32f26ff8ae9b3ca15d02afebc722f4339e8319 Mon Sep 17 00:00:00 2001 From: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Date: Fri, 7 Nov 2025 15:13:29 +0800 Subject: [PATCH 10/26] [https://nvbugs/5636946][fix] Update test model (#8993) Signed-off-by: Ivy Zhang <25222398+crazydemo@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../defs/accuracy/test_llm_api_pytorch.py | 133 ++++++++++++++++++ .../test_lists/qa/llm_function_rtx6k.txt | 2 +- 2 files changed, 134 insertions(+), 1 deletion(-) diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index e4895ee9e0f..4d3b39a7b2d 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -2186,6 +2186,139 @@ def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv, # task.evaluate(llm, # extra_evaluator_kwargs=dict(apply_chat_template=True)) + @skip_pre_blackwell + @pytest.mark.parametrize( + "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", + [ + # Use a larger batch_size to speed up the tests + pytest.param(8, + 1, + 4, + 3, + False, + False, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 4, + 3, + False, + False, + True, + True, + 32, + "TRTLLM", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 8, + 0, + True, + True, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 1, + 0, + True, + True, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(4, + 1, + 1, + 0, + True, + True, + True, + True, + 16, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(4)), + pytest.param(8, + 1, + 8, + 1, + True, + True, + True, + True, + 32, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + pytest.param(8, + 1, + 8, + 1, + True, + True, + True, + True, + 8, + "CUTLASS", + marks=pytest.mark.skip_less_mpi_world_size(8)), + ], + ids=[ + "latency", "latency_trtllmgen", "throughput", "throughput_tp8", + "throughput_tp4", "throughput_mtp", "throughput_bs8_mtp" + ]) + def test_nvfp4_multi_gpus_sm120(self, tp_size, pp_size, ep_size, mtp_nextn, + fp8kv, attention_dp, cuda_graph, + overlap_scheduler, max_batch_size, + moe_backend): + if moe_backend == "TRTLLM" and (get_sm_version() == 120 + or get_sm_version() == 121): + pytest.skip( + "MOE TRTLLM backend does not support SM version 120 or 121") + + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70) + pytorch_config = dict( + disable_overlap_scheduler=not 
overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig(backend=moe_backend)) + + if fp8kv: + kv_cache_config.dtype = "fp8" + + mtp_config = None + if mtp_nextn > 0: + mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn) + with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-0528-FP4-v2", + max_batch_size=max_batch_size, + tensor_parallel_size=tp_size, + pipeline_parallel_size=pp_size, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + **pytorch_config, + enable_attention_dp=attention_dp, + speculative_config=mtp_config) as llm: + + assert llm.args.moe_config.backend == moe_backend + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + + task = MMLU(self.MODEL_NAME) + task.evaluate(llm) + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + # This covers the case with relatively large seqlen in the generation phase. + task = CnnDailymail(self.MODEL_NAME) + task.evaluate(llm) + # Commented out because GPQA takes too long to run + # task = GPQADiamond(self.MODEL_NAME) + # task.evaluate(llm, + # extra_evaluator_kwargs=dict(apply_chat_template=True)) + @skip_pre_blackwell @pytest.mark.parametrize( "tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,max_batch_size,moe_backend", diff --git a/tests/integration/test_lists/qa/llm_function_rtx6k.txt b/tests/integration/test_lists/qa/llm_function_rtx6k.txt index ed257078678..88b1e1ebd14 100644 --- a/tests/integration/test_lists/qa/llm_function_rtx6k.txt +++ b/tests/integration/test_lists/qa/llm_function_rtx6k.txt @@ -30,7 +30,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtl accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_eagle3] -accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] +accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_sm120[throughput_tp8] test_e2e.py::test_ptp_quickstart_advanced_mixed_precision test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8] From 9fe24d60b05885ab5c2d0e198fdcd1c90bae5722 Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Sat, 8 Nov 2025 00:27:12 +0800 Subject: [PATCH 11/26] [None][doc] Replace the relative links with absolute links in README.md. (#8997) Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4bcab9d94b6..4c6745dc5b2 100644 --- a/README.md +++ b/README.md @@ -230,7 +230,7 @@ Serverless TensorRT LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co TensorRT LLM is an open-sourced library for optimizing Large Language Model (LLM) inference. 
It provides state-of-the-art optimizations, including custom attention kernels, inflight batching, paged KV caching, quantization (FP8, [FP4](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/), INT4 [AWQ](https://arxiv.org/abs/2306.00978), INT8 [SmoothQuant](https://arxiv.org/abs/2211.10438), ...), speculative decoding, and much more, to perform inference efficiently on NVIDIA GPUs. -[Architected on PyTorch](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/torch/arch_overview.md), TensorRT LLM provides a high-level Python [LLM API](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server). +[Architected on PyTorch](https://github.com/NVIDIA/TensorRT-LLM/blob/release/1.1/docs/source/developer-guide/overview.md), TensorRT LLM provides a high-level Python [LLM API](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html#llm-api) that supports a wide range of inference setups - from single-GPU to multi-GPU or multi-node deployments. It includes built-in support for various parallelism strategies and advanced features. The LLM API integrates seamlessly with the broader inference ecosystem, including NVIDIA [Dynamo](https://github.com/ai-dynamo/dynamo) and the [Triton Inference Server](https://github.com/triton-inference-server/server). TensorRT LLM is designed to be modular and easy to modify. Its PyTorch-native architecture allows developers to experiment with the runtime or extend functionality. Several popular models are also pre-defined and can be customized using [native PyTorch code](./tensorrt_llm/_torch/models/modeling_deepseekv3.py), making it easy to adapt the system to specific needs. From 88a398ce23d9b3b32381b55cc0a538d97a1c9d7c Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Sat, 8 Nov 2025 02:14:00 +0800 Subject: [PATCH 12/26] [https://nvbugs/5575920][fix] Fix cublas/cublasLt handle creation memory not sufficient error (#8900) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- cpp/tensorrt_llm/common/opUtils.cpp | 78 +++++++++++++++++-- .../defs/accuracy/test_llm_api_pytorch.py | 12 ++- 2 files changed, 82 insertions(+), 8 deletions(-) diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index ae3810a255f..736cd1c48d0 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -252,6 +252,46 @@ class PerCudaCtxPerThreadSingletonCreator std::unordered_map, hash>* mObservers; }; +// Structure to hold memory information +struct MemoryInfo +{ + size_t free_mb; + size_t total_mb; + float free_percent; +}; + +// Helper function to get current memory information +MemoryInfo getMemoryInfo() +{ + size_t free_mem = 0, total_mem = 0; + TLLM_CUDA_CHECK(cudaMemGetInfo(&free_mem, &total_mem)); + + size_t const free_mb = free_mem / (1024 * 1024); + size_t const total_mb = total_mem / (1024 * 1024); + float const free_percent = (total_mem > 0) ? 
(static_cast(free_mem) / total_mem * 100.0f) : 0.0f; + + return {free_mb, total_mb, free_percent}; +} + +// Helper function to log current memory usage +void logMemoryUsage(char const* operation, CUcontext ctx) +{ + auto const mem = getMemoryInfo(); + TLLM_LOG_DEBUG("%s: Context=%p, Free Memory=%zu MB (%.1f%%), Total=%zu MB", operation, ctx, mem.free_mb, + mem.free_percent, mem.total_mb); +} + +// Helper function to throw +void throwCublasErrorWithMemInfo(char const* operation, CUcontext ctx, cublasStatus_t status) +{ + auto const mem = getMemoryInfo(); + TLLM_THROW( + "Failed to create %s. " + "Status: %d, Context: %p, Free Memory: %zu MB (%.1f%%), Total: %zu MB. " + "Consider reducing kv_cache_config.free_gpu_memory_fraction.", + operation, status, ctx, mem.free_mb, mem.free_percent, mem.total_mb); +} + } // namespace std::shared_ptr getCublasHandle() @@ -259,13 +299,26 @@ std::shared_ptr getCublasHandle() static PerCudaCtxPerThreadSingletonCreator creator( []() -> auto { - auto handle = std::unique_ptr(new cublasHandle_t); - TLLM_CUDA_CHECK(cublasCreate(handle.get())); + CUcontext ctx = getCurrentCudaCtx(); + logMemoryUsage("Creating cublas handle", ctx); + + auto handle = std::make_unique(); + auto status = cublasCreate(handle.get()); + + if (status != CUBLAS_STATUS_SUCCESS) + { + throwCublasErrorWithMemInfo("cublas handle", ctx, status); + } + return handle; }, [](cublasHandle_t* handle) { - TLLM_CUDA_CHECK(cublasDestroy(*handle)); + auto status = cublasDestroy(*handle); + if (status != CUBLAS_STATUS_SUCCESS) + { + TLLM_LOG_WARNING("Failed to destroy cublas handle. Status: %d", status); + } delete handle; handle = nullptr; }); @@ -277,13 +330,26 @@ std::shared_ptr getCublasLtHandle() static PerCudaCtxPerThreadSingletonCreator creator( []() -> auto { - auto handle = std::unique_ptr(new cublasLtHandle_t); - TLLM_CUDA_CHECK(cublasLtCreate(handle.get())); + CUcontext ctx = getCurrentCudaCtx(); + logMemoryUsage("Creating cublasLt handle", ctx); + + auto handle = std::make_unique(); + auto status = cublasLtCreate(handle.get()); + + if (status != CUBLAS_STATUS_SUCCESS) + { + throwCublasErrorWithMemInfo("cublasLt handle", ctx, status); + } + return handle; }, [](cublasLtHandle_t* handle) { - TLLM_CUDA_CHECK(cublasLtDestroy(*handle)); + auto status = cublasLtDestroy(*handle); + if (status != CUBLAS_STATUS_SUCCESS) + { + TLLM_LOG_WARNING("Failed to destroy cublasLt handle. 
Status: %d", status); + } delete handle; handle = nullptr; }); diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 4d3b39a7b2d..2bd4707a479 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -164,7 +164,11 @@ def test_fp8(self, fp8kv, attn_backend, torch_compile): disable_overlap_scheduler=torch_compile, ) if fp8kv: - pytorch_config["kv_cache_config"] = KvCacheConfig(dtype="fp8") + pytorch_config["kv_cache_config"] = KvCacheConfig( + dtype="fp8", + free_gpu_memory_fraction= + 0.8, # Prevent cublas/cublasLt handle allocation memory insufficient errors + ) with LLM( f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", **pytorch_config) as llm: @@ -199,7 +203,11 @@ def test_fp8_4gpus(self, tp_size, pp_size, fp8kv, attn_backend, disable_overlap_scheduler=torch_compile, ) if fp8kv: - pytorch_config["kv_cache_config"] = KvCacheConfig(dtype="fp8") + pytorch_config["kv_cache_config"] = KvCacheConfig( + dtype="fp8", + free_gpu_memory_fraction= + 0.8, # Prevent cublas/cublasLt handle allocation memory insufficient errors + ) with LLM( f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8", tensor_parallel_size=tp_size, From aa11be1185d5b6d432484e390ab5ee79533d6cdb Mon Sep 17 00:00:00 2001 From: Guoming Zhang <137257613+nv-guomingz@users.noreply.github.com> Date: Mon, 10 Nov 2025 13:44:16 +0800 Subject: [PATCH 13/26] =?UTF-8?q?[TRTLLM-9073][doc]=20Add=20the=20missing?= =?UTF-8?q?=20content=20for=20model=20support=20section=20and=20fix?= =?UTF-8?q?=E2=80=A6=20(#9033)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: nv-guomingz <137257613+nv-guomingz@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- docs/source/overview.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/overview.md b/docs/source/overview.md index fe44002b167..c132756d551 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -23,7 +23,7 @@ TensorRT LLM delivers breakthrough performance on the latest NVIDIA GPUs: ### 🎯 **Comprehensive Model Support** -TensorRT LLM supports the latest and most popular LLM architectures: +TensorRT LLM supports the latest and most popular LLM [architectures](https://nvidia.github.io/TensorRT-LLM/models/supported-models.html). - **Language Models**: GPT-OSS, Deepseek-R1/V3, Llama 3/4, Qwen2/3, Gemma 3, Phi 4... - **Multi-modal Models**: LLaVA-NeXT, Qwen2-VL, VILA, Llama 3.2 Vision... 
From b10e908a1cc75ab0e0c825ad3af25d24c2d0fa04 Mon Sep 17 00:00:00 2001 From: Vincent Zhang Date: Tue, 11 Nov 2025 13:00:47 +0800 Subject: [PATCH 14/26] [https://nvbugs/5284463][fix] fix ada fp8 group gemm lacks shared memory (#9044) Signed-off-by: Vincent Zhang Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../kernels/cutlass_kernels/cutlass_heuristic.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 7c81a5d7b56..1283d8936e4 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -177,13 +177,13 @@ std::vector get_candidate_tiles( { if (sm == 89 || sm >= 120) { - return {CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128, - CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, + return {CutlassTileConfig::CtaShape32x128x64_WarpShape32x32x64, CutlassTileConfig::CtaShape64x128x64_WarpShape64x32x64, CutlassTileConfig::CtaShape64x64x128_WarpShape32x64x64, CutlassTileConfig::CtaShape128x64x64_WarpShape64x32x64, CutlassTileConfig::CtaShape128x256x64_WarpShape64x64x64, - CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64}; + CutlassTileConfig::CtaShape256x128x64_WarpShape64x64x64, + CutlassTileConfig::CtaShape16x256x128_WarpShape16x64x128}; } else { From eeb69fc1cb284aa25022f49acd43cd15188cfe18 Mon Sep 17 00:00:00 2001 From: peaceh-nv <103117813+peaceh-nv@users.noreply.github.com> Date: Tue, 11 Nov 2025 15:42:08 +0800 Subject: [PATCH 15/26] [https://nvbugs/5570575][fix] : Use less kv cache memory on SM120 (#9054) Signed-off-by: peaceh <103117813+peaceh-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/defs/test_e2e.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 16bea901095..679705bb283 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2466,6 +2466,7 @@ def test_ptp_quickstart_advanced_2gpus_sm120(llm_root, llm_venv, model_name, f"{llm_models_root()}/{model_path}", "--tp_size=2", "--max_num_tokens=256", + f"--kv_cache_fraction={_MEM_FRACTION_50}", ]) From 62c2d989107f6ceed61424f22e56c450454d85c7 Mon Sep 17 00:00:00 2001 From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Date: Wed, 12 Nov 2025 16:08:05 +0800 Subject: [PATCH 16/26] [https://nvbugs/5628952][fix] avoid cudaFree overlap with cuda graph (#8903) Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/waives.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1856f8dbd5d..e201ad66729 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -351,7 +351,6 @@ unittest/bindings/test_hostfunc.py::test_hostfunc SKIP (https://nvbugs/5643631) examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052) accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441) 
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441) -accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_nixl_backend SKIP (https://nvbugs/5651824) unittest/_torch/modules SKIP (https://nvbugs/5637037) accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[False] SKIP (https://nvbugs/5651854) accuracy/test_disaggregated_serving.py::TestQwen3_8B::test_auto_dtype[True] SKIP (https://nvbugs/5651854) From e1bcf6b3a6207e2919bc3126aef29a7795248cff Mon Sep 17 00:00:00 2001 From: Michal Guzek Date: Thu, 13 Nov 2025 05:17:20 -0800 Subject: [PATCH 17/26] [https://nvbugs/5628204][fix] Stop token IDs - fast path optimization for single stop token IDs only (#9014) Signed-off-by: Michal Guzek Signed-off-by: Michal Guzek Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tensorrt_llm/_torch/pyexecutor/sampler.py | 16 +++- .../integration/test_lists/test-db/l0_a10.yml | 1 + .../_torch/sampler/test_trtllm_sampler.py | 84 ++++++++++++++++++- 3 files changed, 96 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/sampler.py b/tensorrt_llm/_torch/pyexecutor/sampler.py index 99ad3453b7e..c2c4032d785 100644 --- a/tensorrt_llm/_torch/pyexecutor/sampler.py +++ b/tensorrt_llm/_torch/pyexecutor/sampler.py @@ -698,12 +698,24 @@ def _meet_max_token_stop_criteria(request: LlmRequest, max_seq_len: int): ) @staticmethod - def _meet_stop_token_criteria(request: LlmRequest): + def _meet_stop_token_criteria(request: LlmRequest, new_token: int): if request.py_stop_words_list: assert isinstance(request.py_stop_words_list, list), ( "request.py_stop_words_list should be a list" ) stop_words_list, prefix_sum = request.py_stop_words_list + + # Determine max stop word length to decide optimization path + max_stop_word_length = prefix_sum[0] if prefix_sum else 0 + for i in range(1, len(prefix_sum)): + word_length = prefix_sum[i] - prefix_sum[i - 1] + max_stop_word_length = max(max_stop_word_length, word_length) + + # Fast path: all stop words are single tokens + if max_stop_word_length == 1: + return new_token in stop_words_list + + # Slow path: at least one multi-token stop word exists tokens = request.get_tokens(0) offset = 0 for i, offset_end in enumerate(prefix_sum): @@ -730,7 +742,7 @@ def _handle_stop_criteria( request.finish_by(FinishReason.LENGTH, BEAM) return True - if cls._meet_stop_token_criteria(request): + if cls._meet_stop_token_criteria(request, new_token): request.finish_by(FinishReason.STOP_WORDS, BEAM) return True diff --git a/tests/integration/test_lists/test-db/l0_a10.yml b/tests/integration/test_lists/test-db/l0_a10.yml index 3bca2c6eced..1b79caaec78 100644 --- a/tests/integration/test_lists/test-db/l0_a10.yml +++ b/tests/integration/test_lists/test-db/l0_a10.yml @@ -19,6 +19,7 @@ l0_a10: - unittest/utils/test_util.py - unittest/_torch/modeling/test_modeling_mistral.py - unittest/_torch/modeling/test_modeling_pixtral.py + - unittest/_torch/sampler/test_trtllm_sampler.py # NOTE: this is a CPU-only test, but we do not have a dedicated job for this (and therefore no # test list either). 
- unittest/_torch/models/checkpoints/hf/test_weight_loader.py diff --git a/tests/unittest/_torch/sampler/test_trtllm_sampler.py b/tests/unittest/_torch/sampler/test_trtllm_sampler.py index dec50239c13..355ab4cce73 100644 --- a/tests/unittest/_torch/sampler/test_trtllm_sampler.py +++ b/tests/unittest/_torch/sampler/test_trtllm_sampler.py @@ -12,8 +12,10 @@ def model_path(): return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0" -def create_llm(model_dir): - """Create LLM with specific overlap scheduler setting""" +def _create_llm_base(model_dir, enable_trtllm_sampler): + """Base LLM creation with configurable sampler.""" + sampler_type = "TRTLLMSampler" if enable_trtllm_sampler else "TorchSampler" + trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False) return LLM( @@ -22,13 +24,23 @@ def create_llm(model_dir): trust_remote_code=True, enable_chunked_prefill=True, cuda_graph_config=CudaGraphConfig(), + sampler_type=sampler_type, kv_cache_config=trt_kv_cache_config, - sampler_type="TRTLLMSampler", max_num_tokens= 128 # Only one request longer than max_num_tokens is required to test chunked prefill ) +def create_llm(model_dir): + """Create LLM with specific overlap scheduler setting""" + return _create_llm_base(model_dir, enable_trtllm_sampler=True) + + +def create_llm_with_torch_sampler(model_dir): + """Create LLM with TorchSampler.""" + return _create_llm_base(model_dir, enable_trtllm_sampler=False) + + @pytest.mark.high_cuda_memory def test_trtllm_sampler(model_path): prompts = [ @@ -68,3 +80,69 @@ def test_trtllm_sampler(model_path): # Verify outputs are consistent for text, expected in zip(texts, expected_outputs): assert similar(text, expected), f"text: {text}, expected: {expected}" + + +@pytest.mark.high_cuda_memory +def test_trtllm_sampler_with_stop_token_ids(model_path): + """Test sampler with stop_token_ids (fast path optimization).""" + + llm = create_llm_with_torch_sampler(model_path) + tokenizer = llm.tokenizer + + prompt = "The capital of France is" + target_sentence = "The capital of France is Paris" + + prompt_tokens = tokenizer.encode(prompt, add_special_tokens=False) + target_tokens = tokenizer.encode(target_sentence, add_special_tokens=False) + + # Use the first token after the prompt as the stop token + assert len(target_tokens) > len( + prompt_tokens), "Target must be longer than prompt" + stop_token_id = target_tokens[len(prompt_tokens)] + + sampling_config = SamplingParams(max_tokens=100, + n=1, + stop_token_ids=[stop_token_id], + temperature=0.0) + + outputs = llm.generate([prompt], sampling_params=sampling_config) + text = outputs[0].outputs[0].text + + output_tokens = tokenizer.encode(text, add_special_tokens=False) + + llm.shutdown() + assert stop_token_id not in output_tokens, f"Output should not contain stop token {stop_token_id}" + assert len(output_tokens + ) < 10, "Should stop very early with first-token stop_token_id" + + +@pytest.mark.high_cuda_memory +def test_torch_sampler_with_multi_token_stop_words(model_path): + """Test TorchSampler with multi-token stop words (slow path).""" + + llm = create_llm_with_torch_sampler(model_path) + tokenizer = llm.tokenizer + + prompt = "The capital of France is" + + # Use a string that will tokenize to multiple tokens + stop_string = "\n\n" + stop_tokens = tokenizer.encode(stop_string, add_special_tokens=False) + + assert len( + stop_tokens + ) > 1, f"Stop string should be multi-token, got {len(stop_tokens)} tokens" + + sampling_config = SamplingParams( + max_tokens=100, + n=1, + 
stop=[stop_string], # Use 'stop' parameter for multi-token + temperature=0.0) + + outputs = llm.generate([prompt], sampling_params=sampling_config) + text = outputs[0].outputs[0].text + + llm.shutdown() + + assert len(text) > 0, "Should generate some text" + assert stop_string not in text, f"Stop string '{repr(stop_string)}' should not appear in the output" From 02da938cf16c506a62c2eaa87c8621dbb7867814 Mon Sep 17 00:00:00 2001 From: Chang Liu <9713593+chang-l@users.noreply.github.com> Date: Thu, 13 Nov 2025 14:58:14 -0800 Subject: [PATCH 18/26] [TRTLLM-7971][doc] Doc update for multimodal in v1.1 (#9015) Signed-off-by: Chang Liu (Enterprise Products) <9713593+chang-l@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- .../legacy/reference/multimodal-feature-support-matrix.md | 4 ++-- docs/source/models/supported-models.md | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/source/legacy/reference/multimodal-feature-support-matrix.md b/docs/source/legacy/reference/multimodal-feature-support-matrix.md index d0cf2372684..b6d99e24ca6 100644 --- a/docs/source/legacy/reference/multimodal-feature-support-matrix.md +++ b/docs/source/legacy/reference/multimodal-feature-support-matrix.md @@ -7,7 +7,7 @@ | VILA | Yes | No | No | No | | LLaVA-NeXT | Yes | Yes | Yes | Yes | | Llama 4 | Yes | Yes | No | No | -| Mistral-Small-3.1 | Yes | Yes | No | No | -| Phi-4-multimodal | Yes | Yes | No | No | +| Mistral-Small-3.1 | Yes | Yes | Yes | Yes | +| Phi-4-multimodal | Yes | Yes | Yes | Yes | | Qwen2-VL | Yes | Yes | Yes | Yes | | Qwen2.5-VL | Yes | Yes | Yes | Yes | diff --git a/docs/source/models/supported-models.md b/docs/source/models/supported-models.md index 749cfcc21d9..c6b6194b5d0 100644 --- a/docs/source/models/supported-models.md +++ b/docs/source/models/supported-models.md @@ -50,13 +50,13 @@ Note: Support for other models may vary. 
Features marked "N/A" are not applicabl | `Gemma3ForConditionalGeneration` | Yes | Yes | N/A | Yes | Yes | N/A | Yes | No | L + I | | `HCXVisionForCausalLM` | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I | | `LlavaLlamaModel (VILA)` | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I + V | -| `LlavaNextForConditionalGeneration` | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I | +| `LlavaNextForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | L + I | | `Llama4ForConditionalGeneration` | Yes | Yes | No | Yes | Yes | No | Yes | No | L + I | | `Mistral3ForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I | -| `NemotronH_Nano_VL_V2` | Yes | Yes | Yes | Yes | Yes | No | Yes | No | L + I + V | +| `NemotronH_Nano_VL_V2` | Yes | Yes | Yes | Yes | Yes | N/A | Yes | No | L + I + V | | `Phi4MMForCausalLM` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I + A | -| `Qwen2VLForConditionalGeneration` | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + V | -| `Qwen2_5_VLForConditionalGeneration` | Yes | Yes | No | Yes | Yes | Yes | Yes | No | L + I + V | +| `Qwen2VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I + V | +| `Qwen2_5_VLForConditionalGeneration` | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | L + I + V | Note: - L: Language From 2cfd34f5286254d1f27b13fb1859b74f84543ea9 Mon Sep 17 00:00:00 2001 From: Leslie Fang Date: Fri, 14 Nov 2025 12:02:41 +0800 Subject: [PATCH 19/26] [https://nvbugs/5652552][fix] Log the llm args (#9119) Signed-off-by: leslie-fang25 Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tensorrt_llm/_torch/pyexecutor/py_executor_creator.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py index 923599562ce..75c63b829c5 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor_creator.py @@ -732,5 +732,7 @@ def drafting_loop_wrapper(model): if mapping.rank == 0: logger.info(f"LLM Args:\n{llm_args}") + logger.info(f"{llm_args}") + py_executor.start_worker() return py_executor From 225cbfcb742ec7077ff44fc1ac414e482f63fa58 Mon Sep 17 00:00:00 2001 From: brb-nv <169953907+brb-nv@users.noreply.github.com> Date: Fri, 14 Nov 2025 02:18:24 -0800 Subject: [PATCH 20/26] [https://nvbugs/5568836][fix] Skip keyword matching for Gemma3 e2e test (#9158) Signed-off-by: Balaram Buddharaju <169953907+brb-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/defs/test_e2e.py | 43 +++++++++++++++++------------- 1 file changed, 24 insertions(+), 19 deletions(-) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index 679705bb283..c1fe7c90895 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -2561,13 +2561,6 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path, [["invention", "person", "scientists", "Lick", "engineers"], ["landscape", "trees", "road", "depicts", "scenic"]] }, - "gemma-3-27b-it": { - "image": [ - ["natural", "turbulent", "dramatic", "scene", "wave"], - ["image", "famous", "rock", "granite", "landmark"], - ["traffic", "moderate", "heavy", "flowing", "cars"], - ], - }, } cmd = [ @@ -2597,6 +2590,14 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, 
model_name, model_path, output = llm_venv.run_cmd(cmd, caller=check_output) + # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. + if model_name == "gemma-3-27b-it": + print( + f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." + ) + print("output:", output) + return + match_ratio = 4.0 / 5 parsed_outputs = parse_output(output) for prompt_output, prompt_keywords in zip( @@ -2891,12 +2892,6 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, # Define expected keywords for each model expected_keywords = { - "gemma-3-27b-it": { - "image": [ - ["half", "dome", "yosemite", "landmark", "rounded"], - ["flowing", "traffic", "vehicles", "road", "Changi"], - ], - }, "mistral-small-3.1-24b-instruct": { "image": [ ["scenic", "rock", "landscape", "monolith", "formation"], @@ -2939,6 +2934,14 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name, output = llm_venv.run_cmd(cmd, caller=check_output) + # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. + if model_name == "gemma-3-27b-it": + print( + f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." + ) + print("output:", output) + return + # Set match ratio based on model match_ratio = 4.0 / 5 @@ -2984,12 +2987,6 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, # Define expected keywords for each model expected_keywords = { - "gemma-3-27b-it": { - "image": [ - ["description", "image", "half", "dome", "park"], - ["atmosphere", "peaceful", "majestic", "scene", "sky"], - ], - }, "mistral-small-3.1-24b-instruct": { "image": [ [ @@ -3031,6 +3028,14 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name, output = llm_venv.run_cmd(cmd, caller=check_output) print("output:", output) + + # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky. + if model_name == "gemma-3-27b-it": + print( + f"Skipping keyword matching test for {model_name}. Smoke test completed successfully." + ) + return + # Set match ratio based on model match_ratio = 4.0 / 5 if model_name.startswith("Phi-4-multimodal-instruct"): From 76f898a9454bad8b3a8c75e1621588f8c6812df2 Mon Sep 17 00:00:00 2001 From: Shunkangz <182541032+Shunkangz@users.noreply.github.com> Date: Mon, 17 Nov 2025 10:44:49 +0800 Subject: [PATCH 21/26] [TRTLLM-9159][doc] Add KV Connector docs (#9043) Signed-off-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> Co-authored-by: Shunkang <182541032+Shunkangz@users.noreply.github.co> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- examples/llm-api/llm_kv_cache_connector.py | 89 ++++++++++++++++++++-- 1 file changed, 84 insertions(+), 5 deletions(-) diff --git a/examples/llm-api/llm_kv_cache_connector.py b/examples/llm-api/llm_kv_cache_connector.py index 1eac9a9cd98..2e87e0c6868 100644 --- a/examples/llm-api/llm_kv_cache_connector.py +++ b/examples/llm-api/llm_kv_cache_connector.py @@ -1,6 +1,84 @@ ### :title KV Cache Connector ### :order 6 ### :section Customization +''' +This script demonstrates the KV cache connector feature in TensorRT-LLM, which enables +custom persistence and reuse of KV cache blocks across different LLM instances. + +**Scenario:** +The script implements a persistent KV cache connector that saves computed KV cache blocks +to disk and loads them back in subsequent runs, eliminating redundant computation for +recurring prompts. 
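+
+At a glance, wiring this connector into an `LLM` instance looks like the condensed sketch
+below; it mirrors the full, working configuration in `main()` further down this file. The
+`connector_worker_class` argument name and the literal module string are assumptions made
+for illustration (the real code derives the module name from `__file__`):
+
+```python
+from tensorrt_llm import LLM
+from tensorrt_llm.llmapi.llm_args import KvCacheConnectorConfig
+
+# Hypothetical condensed sketch; see main() below for the runnable version.
+kv_connector_config = KvCacheConnectorConfig(
+    connector_module="llm_kv_cache_connector",
+    connector_scheduler_class="PersistentKvCacheConnectorLeader",
+    connector_worker_class="PersistentKvCacheConnectorWorker",
+)
+llm = LLM(model="meta-llama/Llama-3.1-8B-Instruct",
+          backend="pytorch",
+          kv_connector_config=kv_connector_config)
+```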
+ +**What is a KV Cache Connector?** + +A KV cache connector is a customizable interface that allows you to: +1. **Save KV Cache:** Persist computed KV cache blocks to an external storage + (disk, database, distributed cache, etc.) +2. **Load KV Cache:** Retrieve previously computed cache blocks instead of recomputing them +3. **Share Cache Across Instances:** Reuse cache blocks across different LLM instances + or sessions, unlike regular block reuse which is limited to a single instance + +**How It Works:** + +This example implements a `PersistentKvCacheConnector` with two key components: + +* **PersistentKvCacheConnectorLeader (Scheduler):** + - Hashes token sequences to create unique identifiers for each cache block + - Checks if cached blocks exist on disk for incoming requests + - Schedules load operations for cache hits + - Schedules save operations for newly computed blocks + +* **PersistentKvCacheConnectorWorker:** + - Executes the actual load/save operations between GPU and disk + - Loads cached blocks from disk files into GPU memory + - Saves newly computed blocks from GPU to disk files + +**Demonstration:** + +The script processes the same prompt twice using two separate LLM instances: + +1. **First Run (Instance 1):** + - The LLM computes the KV cache for the input prompt + - The connector saves the computed cache blocks to disk (as .pt files) + - The generation completes and the LLM instance is destroyed + +2. **Second Run (Instance 2):** + - A new LLM instance is created with the same connector configuration + - When processing the same prompt, the connector finds matching cache blocks on disk + - The cache is loaded from disk instead of being recomputed + - **Expected Outcome:** Faster prefill as cache blocks are loaded rather than computed + - Both outputs should be identical, demonstrating deterministic cache reuse + +**Key Benefits:** + +- **Cross-Instance Cache Sharing:** Share computed caches across multiple LLM instances +- **Persistent Storage:** Cache survives beyond the lifetime of a single LLM instance +- **Custom Storage Backends:** Implement any storage mechanism (shown here: disk files) +- **Reduced Computation:** Eliminate redundant KV cache computation for repeated prompts + +**How to Run:** + +```bash +python llm_kv_cache_connector.py +``` + +Example: +```bash +python llm_kv_cache_connector.py meta-llama/Llama-3.1-8B-Instruct +``` + +**Implementation Notes:** + +- This example uses content-based hashing to identify cache blocks +- Cache files are stored in a temporary directory (cleaned up after the demo) +- The implementation is simplified and not optimized for production use +- Does not support chunked prefill in this example +- See `tensorrt_llm/_torch/pyexecutor/kv_cache_connector.py` for the full connector interface + +**NOTE:** This example connector implementation is designed for demonstration purposes +and is NOT suitable for production use without additional optimizations and error handling. +''' import os import sys @@ -17,11 +95,6 @@ from tensorrt_llm.bindings.internal.batch_manager import LlmRequest from tensorrt_llm.llmapi.llm_args import KvCacheConnectorConfig, TorchLlmArgs -# This is a simple example of the use of the KV cache connector. -# It persists KV cache contents into a folder, and can load them back on subsequent runs. -# See tensorrt_llm/_torch/pyexecutor/connector.py for details about the KV cache connector interface. -# NOTE: This example connector implementation is NOT suitable for production use. 
- CONNECTOR_CACHE_FOLDER_KEY = "CONNECTOR_CACHE_FOLDER" @@ -198,6 +271,7 @@ def main(model: str): this_module = __file__[__file__.rfind("/") + 1:__file__.rfind(".py")] + # --- KV Cache Connector Config --- kv_connector_config = KvCacheConnectorConfig( connector_module=this_module, connector_scheduler_class="PersistentKvCacheConnectorLeader", @@ -207,6 +281,7 @@ def main(model: str): connector_cache_dir = TemporaryDirectory() os.environ[CONNECTOR_CACHE_FOLDER_KEY] = connector_cache_dir.name + # Create LLM instance with KV Cache Connector llm = LLM(model=model, backend="pytorch", cuda_graph_config=None, @@ -220,6 +295,7 @@ def main(model: str): sampling_params = SamplingParams(max_tokens=32) + # Generate text with the first LLM instance and save the kv cache blocks by the connector. output = llm.generate([test_text], sampling_params) text0 = output[0].outputs[0].text @@ -228,16 +304,19 @@ def main(model: str): del llm + # Create a new LLM instance with the same connector configuration llm = LLM(model=model, backend="pytorch", cuda_graph_config=None, kv_connector_config=kv_connector_config) + # Generate text with the second LLM instance and it should reuse the kv cache blocks from the connector. output = llm.generate([test_text], sampling_params) text1 = output[0].outputs[0].text print("Second output (using connector cache): ", text1) + # Verify that the two outputs are identical assert text0 == text1 connector_cache_dir.cleanup() From 96ea4ffa75e3c5672957a6e482b00d86d2a7a033 Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Mon, 17 Nov 2025 15:11:44 +0800 Subject: [PATCH 22/26] [https://nvbugs/5649826][fix] Unwaive test test_llm_commandr_plus_4gpus_summary (#9201) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/qa/llm_function_core.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index 69c24f0f5ee..b7303bc026f 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ b/tests/integration/test_lists/qa/llm_function_core.txt @@ -14,8 +14,8 @@ examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-ena examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only] examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only] -examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (120) -examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (120) +examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disable_weight_only] TIMEOUT (180) +examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] TIMEOUT (180) examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8] examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1] examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2] From a94778a0cb6bc01dec2ca8569285fbebfe5448a3 Mon Sep 17 00:00:00 2001 From: sunnyqgg <159101675+sunnyqgg@users.noreply.github.com> Date: Tue, 18 Nov 2025 19:20:07 +0800 Subject: [PATCH 23/26] [https://nvbugs/5461796][fix] Unwaive and extend time for 
test_llmapi_speculative_decoding_mtp (#9092) Signed-off-by: qgai Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/integration/test_lists/test-db/l0_sanity_check.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_lists/test-db/l0_sanity_check.yml b/tests/integration/test_lists/test-db/l0_sanity_check.yml index 68ec272f03d..894bc21b1e7 100644 --- a/tests/integration/test_lists/test-db/l0_sanity_check.yml +++ b/tests/integration/test_lists/test-db/l0_sanity_check.yml @@ -25,7 +25,7 @@ l0_sanity_check: - llmapi/test_llm_examples.py::test_llmapi_example_multilora - llmapi/test_llm_examples.py::test_llmapi_example_guided_decoding - llmapi/test_llm_examples.py::test_llmapi_example_logits_processor - - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp + - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_mtp TIMEOUT (90) - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_eagle3 - llmapi/test_llm_examples.py::test_llmapi_speculative_decoding_ngram - llmapi/test_llm_examples.py::test_llmapi_sampling From ef4a779a69d25a652cc9d3e2431c9b1e82c6b59b Mon Sep 17 00:00:00 2001 From: QI JUN <22017000+QiJune@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:36:01 +0800 Subject: [PATCH 24/26] [TRTLLM-9092][doc] Add a pre-quantized example in quick start guide (#9223) Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- docs/source/quick-start-guide.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/docs/source/quick-start-guide.md b/docs/source/quick-start-guide.md index 4d70b2eba84..2c172f66c01 100644 --- a/docs/source/quick-start-guide.md +++ b/docs/source/quick-start-guide.md @@ -24,6 +24,13 @@ To start the server, you can run a command like the following example inside a D trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" ``` +You may also deploy pre-quantized models to improve performance. +Ensure your GPU supports FP8 quantization before running the following: + +```bash +trtllm-serve "nvidia/Qwen3-8B-FP8" +``` + ```{note} If you are running `trtllm-serve` inside a Docker container, you have two options for sending API requests: 1. Expose a port (e.g., 8000) to allow external access to the server from outside the container. 
From e413c8039f65547aa2095c8ef6c992cb28619815 Mon Sep 17 00:00:00 2001 From: dominicshanshan <30051912+dominicshanshan@users.noreply.github.com> Date: Wed, 19 Nov 2025 09:46:02 +0800 Subject: [PATCH 25/26] [https://nvbugs/5648685][fix] Fix openAI server waiting time to avoid large model weight loading out time (#9254) Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tests/unittest/llmapi/apps/openai_server.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unittest/llmapi/apps/openai_server.py b/tests/unittest/llmapi/apps/openai_server.py index 39c9988d9f5..b3fde6b94c5 100644 --- a/tests/unittest/llmapi/apps/openai_server.py +++ b/tests/unittest/llmapi/apps/openai_server.py @@ -16,7 +16,7 @@ class RemoteOpenAIServer: DUMMY_API_KEY = "tensorrt_llm" - MAX_SERVER_START_WAIT_S = 600 # wait for server to start for 600 seconds + MAX_SERVER_START_WAIT_S = 7200 # wait for server to start for 7200 seconds (~ 2 hours) for LLM models weight loading def __init__(self, model: str, From 6d4a5fc7617b1fd9bd041551947a50f0e857a9f6 Mon Sep 17 00:00:00 2001 From: JunyiXu-nv <219237550+JunyiXu-nv@users.noreply.github.com> Date: Thu, 20 Nov 2025 19:31:35 +0800 Subject: [PATCH 26/26] =?UTF-8?q?[https://nvbugs/5670793][fix]=20Solve=20t?= =?UTF-8?q?rtllm-serve=20launch=5Fdisaggregated=E2=80=A6=20(#9324)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Junyi Xu <219237550+JunyiXu-nv@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine --- tensorrt_llm/commands/serve.py | 8 ++++---- tensorrt_llm/llmapi/mpi_session.py | 7 +++++++ 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/tensorrt_llm/commands/serve.py b/tensorrt_llm/commands/serve.py index 372808bc0ca..7c10b7a2377 100644 --- a/tensorrt_llm/commands/serve.py +++ b/tensorrt_llm/commands/serve.py @@ -29,7 +29,7 @@ parse_disagg_config_file, parse_metadata_server_config_file) from tensorrt_llm.llmapi.llm_utils import update_llm_args_with_extra_dict -from tensorrt_llm.llmapi.mpi_session import find_free_port +from tensorrt_llm.llmapi.mpi_session import find_free_ipc_addr from tensorrt_llm.llmapi.reasoning_parser import ReasoningParserFactory from tensorrt_llm.logger import logger, severity_map from tensorrt_llm.serve import OpenAIDisaggServer, OpenAIServer @@ -730,10 +730,10 @@ def _launch_disaggregated_leader(sub_comm, instance_idx: int, config_file: str, # This mimics the behavior of trtllm-llmapi-launch # TODO: Make the port allocation atomic - free_port = find_free_port() + free_ipc_addr = find_free_ipc_addr() os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS] = "1" - os.environ[LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR. - value] = f"tcp://127.0.0.1:{free_port}" + os.environ[ + LlmLauncherEnvs.TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR.value] = free_ipc_addr os.environ[DisaggLauncherEnvs.TLLM_DISAGG_RUN_REMOTE_MPI_SESSION_CLIENT. 
value] = "1" os.environ[DisaggLauncherEnvs.TLLM_DISAGG_INSTANCE_IDX] = str(instance_idx) diff --git a/tensorrt_llm/llmapi/mpi_session.py b/tensorrt_llm/llmapi/mpi_session.py index f0275d7f90a..d32e5a7b7aa 100644 --- a/tensorrt_llm/llmapi/mpi_session.py +++ b/tensorrt_llm/llmapi/mpi_session.py @@ -541,6 +541,13 @@ def find_free_port() -> int: return s.getsockname()[1] +def find_free_ipc_addr() -> str: + import os + import tempfile + import uuid + return f'ipc://{os.path.join(tempfile.gettempdir(), "rpc_" + str(uuid.uuid4()))}' + + def get_mpi_world_size() -> int: # avoid cyclic import from ..executor.utils import get_spawn_proxy_process_env