@@ -4118,6 +4118,80 @@ def test_auto_dtype(self, tp_size, pp_size, ep_size):
41184118 task .evaluate (llm )
41194119
41204120
@skip_pre_hopper
@pytest.mark.skip_less_device_memory(80000)
class TestQwen3NextInstruct(LlmapiAccuracyTestHarness):
    """Accuracy tests (MMLU + GSM8K) for Qwen3-Next-80B-A3B-Instruct.

    Checkpoints are resolved under the shared model root returned by
    ``llm_models_root()``; each test builds an ``LLM`` with the requested
    parallelism/runtime configuration and scores it on both tasks.
    """

    MODEL_PATH = f"{llm_models_root()}/Qwen3-Next"
    MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Instruct"

    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
        [
            (4, 1, 4, True, True),
        ],
        ids=[
            "tp4ep4_cudagraph_overlap",
        ],
    )
    def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph,
                       overlap_scheduler):
        """Evaluate the bf16 checkpoint on MMLU and GSM8K across 4 GPUs."""
        # Fixed: a leftover debug line used to overwrite this with the HF hub
        # id "Qwen/Qwen3-Next-80B-A3B-Instruct", bypassing the local cache.
        model_path = f"{self.MODEL_PATH}/Qwen3-Next-80B-A3B-Instruct"
        # NOTE(review): block reuse is disabled here — presumably required by
        # this model's cache layout; confirm against the model integration.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        enable_block_reuse=False)
        pytorch_config = dict(
            disable_overlap_scheduler=not overlap_scheduler,
            cuda_graph_config=CudaGraphConfig(max_batch_size=512)
            if cuda_graph else None)

        with LLM(
                model_path,
                tensor_parallel_size=tp_size,
                max_num_tokens=16384,
                pipeline_parallel_size=pp_size,
                moe_expert_parallel_size=ep_size,
                kv_cache_config=kv_cache_config,
                **pytorch_config,
        ) as llm:
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)

    @skip_pre_blackwell
    @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM"],
                             ids=["cutlass", "trtllm"])
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
        [(1, 1, 1, True, True), (4, 1, 1, True, True), (4, 1, 4, True, True),
         (4, 1, 4, False, False)],
        ids=["tp1", "tp4ep1", "tp4ep4", "no_cuda_graph_overlap"])
    def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
                   overlap_scheduler):
        """Evaluate the NVFP4-PTQ (FP8 KV cache) checkpoint on MMLU/GSM8K."""
        # Fixed: a leftover debug line used to overwrite this with a personal
        # scratch path (/home/scratch.didow_sw_1/...), which does not exist
        # on CI machines.
        model_path = f"{self.MODEL_PATH}/qwen3-next-80b-instruct-nvfp4-ptq-fp8kv"

        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        enable_block_reuse=False)
        pytorch_config = dict(
            disable_overlap_scheduler=not overlap_scheduler,
            cuda_graph_config=CudaGraphConfig(max_batch_size=512)
            if cuda_graph else None)
        moe_config = MoeConfig(backend=moe_backend)

        with LLM(model_path,
                 tensor_parallel_size=tp_size,
                 max_num_tokens=16384,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 kv_cache_config=kv_cache_config,
                 **pytorch_config,
                 moe_config=moe_config) as llm:
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
4194+
41214195class TestSeedOss_36B (LlmapiAccuracyTestHarness ):
41224196 MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
41234197 MODEL_PATH = f"{ llm_models_root ()} /Seed-OSS/Seed-OSS-36B-Instruct"
0 commit comments