Skip to content

Commit 3743cf4

Browse files
committed
add qwen3-next CI test of accuracy on BF16 and NVFP4
Signed-off-by: jiant <[email protected]>
1 parent 79a6c97 commit 3743cf4

File tree

4 files changed

+90
-0
lines changed

4 files changed

+90
-0
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# tests/integration/defs/accuracy/references/gsm8k.yaml (region after change).
# Reconstructed as valid YAML: the page scrape interleaved diff line-number
# artifacts ("138138", "141+", ...) into the data; values are unchanged.
# Convention: model name -> list of {optional quant config, accuracy} entries.
Qwen3/Qwen3-235B-A22B:
  - accuracy: 85.78
Qwen3/Qwen3-Next-80B-A3B-Thinking:
  - accuracy: 81.577
Qwen3/Qwen3-Next-80B-A3B-Instruct:
  # BF16 baseline
  - accuracy: 84.42
  # NVFP4 weights with FP8 KV cache
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 84.32
moonshotai/Kimi-K2-Instruct:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 94.84

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# tests/integration/defs/accuracy/references/mmlu.yaml (region after change).
# Reconstructed as valid YAML: the page scrape interleaved diff line-number
# artifacts into the data; values are unchanged.
# Convention: model name -> list of {optional quant config, accuracy} entries.
Qwen3/Qwen3-235B-A22B:
  - accuracy: 86
Qwen3/Qwen3-Next-80B-A3B-Thinking:
  - accuracy: 86
Qwen3/Qwen3-Next-80B-A3B-Instruct:
  # BF16 baseline
  - accuracy: 85.58
  # NVFP4 weights with FP8 KV cache
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 85
moonshotai/Kimi-K2-Instruct:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 87.65

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4118,6 +4118,80 @@ def test_auto_dtype(self, tp_size, pp_size, ep_size):
41184118
task.evaluate(llm)
41194119

41204120

4121+
@skip_pre_hopper
@pytest.mark.skip_less_device_memory(80000)
class TestQwen3NextInstruct(LlmapiAccuracyTestHarness):
    """Accuracy (MMLU + GSM8K) tests for Qwen3-Next-80B-A3B-Instruct.

    Covers the BF16 checkpoint on 4 GPUs and the NVFP4 (FP8 KV cache)
    checkpoint across TP/EP and MoE-backend configurations.
    """
    # Root of the shared CI model cache for this model family.
    MODEL_PATH = f"{llm_models_root()}/Qwen3-Next"
    # Key used to look up reference accuracies in gsm8k.yaml / mmlu.yaml.
    MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Instruct"

    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
        [
            (4, 1, 4, True, True),
        ],
        ids=[
            "tp4ep4_cudagraph_overlap",
        ],
    )
    def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph,
                       overlap_scheduler):
        """Evaluate the BF16 checkpoint on MMLU and GSM8K with 4 GPUs."""
        # BUG FIX: the original reassigned model_path to the HuggingFace repo
        # id "Qwen/Qwen3-Next-80B-A3B-Instruct" right after this line — a
        # local-debug leftover that bypasses llm_models_root() and would make
        # CI download from the hub instead of the shared model cache.
        model_path = f"{self.MODEL_PATH}/Qwen3-Next-80B-A3B-Instruct"
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        enable_block_reuse=False)
        pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                              cuda_graph_config=CudaGraphConfig(
                                  max_batch_size=512) if cuda_graph else None)

        with LLM(
                model_path,
                tensor_parallel_size=tp_size,
                max_num_tokens=16384,
                pipeline_parallel_size=pp_size,
                moe_expert_parallel_size=ep_size,
                kv_cache_config=kv_cache_config,
                **pytorch_config,
        ) as llm:
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)

    # NOTE(review): the tp4/ep4 parameterizations below have no
    # @pytest.mark.skip_less_device(4) guard (unlike test_bf16_4gpu);
    # confirm whether the test-list scheduling makes that safe.
    @skip_pre_blackwell
    @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM"],
                             ids=["cutlass", "trtllm"])
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
        [(1, 1, 1, True, True), (4, 1, 1, True, True), (4, 1, 4, True, True),
         (4, 1, 4, False, False)],
        ids=["tp1", "tp4ep1", "tp4ep4", "no_cuda_graph_overlap"])
    def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
                   overlap_scheduler):
        """Evaluate the NVFP4 + FP8-KV checkpoint on MMLU and GSM8K."""
        # BUG FIX: the original reassigned model_path to a personal scratch
        # directory ("/home/scratch.didow_sw_1/models/...") right after this
        # line; that path does not exist on CI runners, so every run would
        # fail to load the model. Keep only the shared-cache path.
        model_path = f"{self.MODEL_PATH}/qwen3-next-80b-instruct-nvfp4-ptq-fp8kv"

        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        enable_block_reuse=False)
        pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
                              cuda_graph_config=CudaGraphConfig(
                                  max_batch_size=512) if cuda_graph else None)
        moe_config = MoeConfig(backend=moe_backend)

        with LLM(model_path,
                 tensor_parallel_size=tp_size,
                 max_num_tokens=16384,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 kv_cache_config=kv_cache_config,
                 **pytorch_config,
                 moe_config=moe_config) as llm:
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
4193+
4194+
41214195
class TestSeedOss_36B(LlmapiAccuracyTestHarness):
41224196
MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
41234197
MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
# tests/integration/test_lists/test-db/l0_dgx_b200.yml (region after change).
# Reconstructed as valid YAML list entries: the page scrape interleaved diff
# line-number artifacts ("5959", "62+", ...) into the list; entries unchanged.
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp2pp2
- accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
# New Qwen3-Next-80B-A3B-Instruct accuracy coverage (BF16 + NVFP4).
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
- condition:
    ranges:
      system_gpu_count:

0 commit comments

Comments
 (0)