Add Eagle3 one-model GPT-OSS accuracy test; exclude SM 120 from the SM >= 100 spec-decoding paths

jhaotingc · jhaotingc · commit a6b300c54047 · 2025-11-21T09:25:51.000-08:00
Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -465,7 +465,7 @@ def run(
             self.spec_decoding_generation_lengths,
             self.spec_decoding_position_offsets, self.spec_decoding_packed_mask
         ]
-        if get_sm_version() >= 100:
+        if get_sm_version() >= 100 and get_sm_version() != 120:
             spec_decoding_tensor_params.append(
                 self.spec_decoding_bl_tree_mask_offset)
             spec_decoding_tensor_params.append(self.spec_decoding_bl_tree_mask)
@@ -1158,8 +1158,8 @@ def update_spec_dec_param(
             spec_decoding_generation_lengths = None
 
         self.is_spec_decoding_enabled = is_spec_decoding_enabled
-        if get_sm_version(
-        ) >= 100 and not is_spec_dec_tree and not is_spec_dec_dynamic_tree:
+        if (get_sm_version() >= 100 and get_sm_version() != 120
+            ) and not is_spec_dec_tree and not is_spec_dec_dynamic_tree:
             self.is_spec_decoding_enabled = False
 
         # use_spec_decoding is default to true by default, change in runtime by layers / requests
@@ -1190,7 +1190,7 @@ def update_spec_dec_param(
                 dtype=torch.int,
                 device='cuda',
             )
-            if get_sm_version() >= 100:
+            if get_sm_version() >= 100 and get_sm_version() != 120:
                 self.spec_decoding_param_prepare_for_blackwell()
             else:
                 self.spec_decoding_bl_tree_mask_offset = None
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -3956,12 +3956,14 @@ def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
                           extra_evaluator_kwargs=extra_evaluator_kwargs)
 
     @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("eagle3_one_model", [False, True],
+                             ids=["two_model", "one_model"])
     @pytest.mark.parametrize(
         "moe_backend",
         ["CUTLASS",
          pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
         ids=["cutlass", "trtllm", "triton"])
-    def test_eagle3(self, moe_backend, mocker):
+    def test_eagle3(self, eagle3_one_model, moe_backend, mocker):
         if moe_backend == "TRITON":
             if not IS_TRITON_KERNELS_AVAILABLE:
                 pytest.skip("Triton kernels are not available")
@@ -3976,17 +3978,23 @@ def test_eagle3(self, moe_backend, mocker):
         mocker.patch.object(GPQADiamond, "MAX_OUTPUT_LEN", MAX_OUTPUT_LEN)
         mocker.patch.object(GPQADiamond, "MAX_INPUT_LEN", MAX_INPUT_LEN)
 
-        # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
-        pytorch_config = dict(disable_overlap_scheduler=True,
-                              cuda_graph_config=CudaGraphConfig())
+        if eagle3_one_model:
+            pytorch_config = dict(disable_overlap_scheduler=False,
+                                  max_batch_size=1,
+                                  cuda_graph_config=CudaGraphConfig(
+                                      enable_padding=True, max_batch_size=1))
+        else:
+            # https://nvbugs/5590408: 2-Model overlap scheduling has accuracy issue
+            pytorch_config = dict(disable_overlap_scheduler=True,
+                                  cuda_graph_config=CudaGraphConfig())
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                         dtype="auto")
 
         eagle_model_dir = f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3"
         draft_len = 3
         spec_config = EagleDecodingConfig(max_draft_len=draft_len,
                                           speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=False)
+                                          eagle3_one_model=eagle3_one_model)
 
         max_seq_len = MAX_INPUT_LEN + MAX_OUTPUT_LEN
         llm = LLM(self.MODEL_PATH,
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -564,9 +564,12 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False-False-False]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True-True-True]
 accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
@@ -101,9 +101,12 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model]
 accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -342,9 +342,12 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-au
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -50,8 +50,10 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
@@ -196,6 +198,8 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-two_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[trtllm-one_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -185,8 +185,10 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass]
-  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-two_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -109,5 +109,7 @@ l0_rtx_pro_6000:
   # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-one_model]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[triton-one_model]
   - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8]
   - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -346,7 +346,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_laten
 accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm_attention_dp] SKIP (https://nvbugs/5637220)
 llmapi/test_llm_examples.py::test_llmapi_example_multilora SKIP (https://nvbugs/5636857)
 unittest/_torch/modules/test_mla_helix.py::test_mla_helix_distributed SKIP (https://nvbugspro.nvidia.com/bug/5637012)
-accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass] SKIP (https://nvbugs/5636916)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3[cutlass-two_model] SKIP (https://nvbugs/5636916)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5616182)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)