
Commit 90fd3cd (parent: f2ebaf2)

[TRTLLM-8980][test] Clean up spec dec tests in test_llm_api_pytorch

Signed-off-by: Mike Iovine <[email protected]>
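For context, below is a minimal, self-contained sketch of the two-model MTP setup that the new test_bfloat16_2_model_mtp added by this commit exercises. The configuration values are taken from the diff; the import locations and the model path are assumptions for illustration, not part of the commit.

    # Sketch only: two-model MTP speculative decoding, mirroring the values in
    # test_bfloat16_2_model_mtp. Import paths and model_path are assumptions.
    from tensorrt_llm import LLM
    from tensorrt_llm.llmapi import (CudaGraphConfig, KvCacheConfig,
                                     MTPDecodingConfig)

    model_path = "<path to DeepSeek-V3-Lite bf16 checkpoint>"  # placeholder

    # num_nextn_predict_layers=3 configures three next-token-prediction draft
    # layers; mtp_eagle_one_model=False runs the MTP draft head as a separate
    # (second) model, which is the "2 model" path covered by the new test.
    mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
                                   mtp_eagle_one_model=False,
                                   speculative_model_dir=model_path)

    with LLM(model_path,
             kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.5),
             disable_overlap_scheduler=True,
             cuda_graph_config=CudaGraphConfig(),
             enable_chunked_prefill=False,
             max_num_tokens=8192,
             speculative_config=mtp_config) as llm:
        # Single-prompt generation; the test instead runs the GSM8K accuracy task.
        output = llm.generate("The capital of France is")
        print(output.outputs[0].text)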

File tree: 4 files changed (+38 -34 lines)

  tests/integration/defs/accuracy/test_llm_api_pytorch.py
  tests/integration/test_lists/qa/llm_function_core.txt
  tests/integration/test_lists/test-db/l0_b200.yml
  tests/integration/test_lists/test-db/l0_h100.yml

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 32 additions & 32 deletions
@@ -264,9 +264,12 @@ def test_eagle3(self, overlap_scheduler, eagle3_one_model):
 
     @skip_pre_hopper
     def test_ngram(self):
+        max_bs = 16
+
         pytorch_config = dict(
             disable_overlap_scheduler=True,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
+            cuda_graph_config=CudaGraphConfig(
+                batch_sizes=[i for i in range(1, max_bs + 1)]),
         )
 
         kv_cache_config = KvCacheConfig(enable_block_reuse=False,
@@ -284,9 +287,7 @@ def test_ngram(self):
                  **pytorch_config,
                  kv_cache_config=kv_cache_config,
                  speculative_config=spec_config,
-                 max_batch_size=16) as llm:
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
+                 max_batch_size=max_bs) as llm:
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
@@ -593,7 +594,7 @@ def test_fp8_eagle3_tp8(self, eagle3_one_model):
                                           speculative_model_dir=eagle_model_dir,
                                           eagle3_one_model=eagle3_one_model)
         pytorch_config = dict(
-            disable_overlap_scheduler=True,
+            disable_overlap_scheduler=not eagle3_one_model,
             cuda_graph_config=CudaGraphConfig(max_batch_size=1))
         with LLM(model_path,
                  max_batch_size=16,
@@ -1274,6 +1275,25 @@ def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @pytest.mark.skip_less_device_memory(60000)
+    def test_bfloat16_2_model_mtp(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(),
+        )
+        mtp_config = MTPDecodingConfig(num_nextn_predict_layers=3,
+                                       mtp_eagle_one_model=False,
+                                       speculative_model_dir=self.MODEL_PATH)
+        with LLM(self.MODEL_PATH,
+                 kv_cache_config=kv_cache_config,
+                 enable_chunked_prefill=False,
+                 max_num_tokens=8192,
+                 **pytorch_config,
+                 speculative_config=mtp_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(4)
     @parametrize_with_ids("torch_compile", [False, True])
     @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
@@ -1933,14 +1953,19 @@ def test_chunked_prefill(self, quant_dtype, kv_cache_reuse, fp8kv,
 
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
+    @parametrize_with_ids("use_one_model", [False, True])
     @pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
-    def test_guided_decoding(self, backend: str, mtp_nextn: int, mocker):
+    def test_guided_decoding(self, backend: str, mtp_nextn: int,
+                             use_one_model: bool, mocker):
         mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         cuda_graph_config = CudaGraphConfig(enable_padding=True)
         mtp_config = None
         if mtp_nextn > 0:
-            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+            mtp_config = MTPDecodingConfig(
+                num_nextn_predict_layers=mtp_nextn,
+                mtp_eagle_one_model=use_one_model,
+                speculative_model_dir=self.MODEL_PATH)
         llm = LLM(self.MODEL_PATH,
                   guided_decoding_backend=backend,
                   kv_cache_config=kv_cache_config,
@@ -3205,31 +3230,6 @@ def test_nvfp4(
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
-    def test_eagle3(self):
-        pytorch_config = dict(
-            disable_overlap_scheduler=False,
-            cuda_graph_config=CudaGraphConfig(batch_sizes=[1, 2, 3, 4, 8]),
-        )
-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
-
-        eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-eagle3"
-        target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B"
-
-        draft_len = 1
-        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
-                                          speculative_model_dir=eagle_model_dir,
-                                          eagle3_one_model=True)
-
-        llm = LLM(model=target_model_dir,
-                  **pytorch_config,
-                  kv_cache_config=kv_cache_config,
-                  speculative_config=spec_config,
-                  max_seq_len=8192)
-
-        with llm:
-            task = GSM8K(self.MODEL_NAME)
-            task.evaluate(llm)
-
     @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRITON", "TRTLLM"])
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
@@ -459,6 +459,7 @@ accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 3 additions & 1 deletion
@@ -22,6 +22,7 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=True]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] ISOLATION
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
@@ -34,7 +35,8 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=nvfp4-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=False-mtp_nextn=2]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=True-mtp_nextn=2]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
 - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-trtllm-auto]

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 2 additions & 1 deletion
@@ -62,6 +62,7 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-enable_chunked_prefill=False]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_2_model_mtp
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=fp8-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=True-fp8kv=False-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=none-kv_cache_reuse=False-fp8kv=False-overlap_scheduler=True]
@@ -250,7 +251,7 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=eagle-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=True-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=none-mtp_nextn=2-fp8kv=False-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
-- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0]
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-use_one_model=True-mtp_nextn=0]
 - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_fp8_prequantized
 - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_fp8_prequantized
 - accuracy/test_llm_api_pytorch.py::TestGemma3_27BInstruct::test_auto_dtype TIMEOUT (90)
