@@ -264,9 +264,12 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
 
     @skip_pre_hopper
     def test_ngram(self):
+        max_bs = 16
+
         pytorch_config = dict(
             disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+            cuda_graph_config=CudaGraphConfig(
+                batch_sizes=[i for i in range(1, max_bs + 1)]),
         )
 
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
@@ -284,9 +287,7 @@ def test_ngram(self):
                  **pytorch_config,
                  kv_cache_config=kv_cache_config,
                  speculative_config=spec_config,
-                 max_batch_size=16) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+                 max_batch_size=max_bs) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -593,7 +594,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
             speculative_model_dir=eagle_model_dir,
             eagle3_one_model=eagle3_one_model)
         pytorch_config = dict(
-            disable_overlap_scheduler=True,
+            disable_overlap_scheduler=not eagle3_one_model,
             cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
@@ -1274,6 +1275,25 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip_less_device_memory(60000)
+    def test_bfloat16_2_model_mtp(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(),
+        )
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
+                                       mtp_eagle_one_model=False,
+                                       speculative_model_dir=self.MODEL_PATH)
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 enable_chunked_prefill=False,
+                 max_num_tokens=8192,
+                 **pytorch_config,
+                 speculative_config=mtp_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(4)
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
@@ -1933,14 +1953,19 @@ def test_chunked_prefill(self, quant_dtype, kv_cache_reuse, fp8kv,
 
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
+    @parametrize_with_ids("use_one_model", [False, True])
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
-    def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker):
+    def test_guided_decoding(self, backend: str, mtp_nextn: int,
+                             use_one_model: bool, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         cuda_graph_config = CudaGraphConfig(enable_padding=True)
         mtp_config = None
         if mtp_nextn > 0:
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+            mtp_config = MTPDecodingConfig(
+                num_nextn_predict_layers=mtp_nextn,
+                mtp_eagle_one_model=use_one_model,
+                speculative_model_dir=self.MODEL_PATH)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
@@ -3205,31 +3230,6 @@ def test_nvfp4(
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    def test_eagle3(self):
-        pytorch_config = dict(
-            disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1, 2, 3, 4, 8]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-eagle3"
-        target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B"
-
-        draft_len = 1
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=True)
-
-        llm = LLM(model=target_model_dir,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config,
-                  max_seq_len=8192)
-
-        with llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
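
Reviewer note: outside pytest, the two-model MTP configuration added in test_bfloat16_2_model_mtp can be exercised directly through the LLM API. The sketch below is a minimal, hedged example, not part of the diff; the model path and prompt are placeholders, and the import locations are assumed to follow the public tensorrt_llm/llmapi package already used by this test file.

# Minimal standalone sketch of two-model MTP speculative decoding.
# Assumptions: import paths and argument names mirror those used in the
# tests above; "/path/to/target-model" is a placeholder checkpoint path.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                 MTPDecodingConfig)

model_path = "/path/to/target-model"  # placeholder, not taken from the diff

# Two-model MTP: mtp_eagle_one_model=False loads the draft model separately
# from speculative_model_dir, and overlap scheduling stays disabled, matching
# the configuration in test_bfloat16_2_model_mtp.
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
                               mtp_eagle_one_model=False,
                               speculative_model_dir=model_path)

llm = LLM(model_path,
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
          cuda_graph_config=CudaGraphConfig(),
          disable_overlap_scheduler=True,
          enable_chunked_prefill=False,
          max_num_tokens=8192,
          speculative_config=mtp_config)

with llm:
    # Generate from a toy prompt instead of running the GSM8K task.
    outputs = llm.generate(["The capital of France is"])
    print(outputs[0].outputs[0].text)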