
Commit 560bb9c

authored
Adding new MoE e2e tests (#1960)
SUMMARY:
Adds new e2e tests for MoE models. Also includes a small fix so that `scheme: None` no longer errors, and expert gate layers are now ignored by default (whether this is supported is model-dependent).

TEST PLAN:
In progress: https://github.com/neuralmagic/llm-compressor-testing/actions/runs/19368818055

Run locally (after disabling the cadence skip in https://github.com/vllm-project/llm-compressor/blob/main/tests/e2e/vLLM/test_vllm.py):

export TEST_DATA_FILE="${REPOS}/llm-compressor/tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_tensor.yaml"
pytest tests/e2e/vLLM/test_vllm.py -vs 2>&1 | tee log-fp8.log

export TEST_DATA_FILE="${REPOS}/llm-compressor/tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml"
pytest tests/e2e/vLLM/test_vllm.py -vs 2>&1 | tee log-fp4.log

---------

Signed-off-by: HDCharles <[email protected]>
1 parent 6fea888 commit 560bb9c

File tree

4 files changed

+16
-3
lines changed


tests/e2e/e2e_utils.py

Lines changed: 4 additions & 2 deletions

@@ -84,11 +84,13 @@ def data_collator(batch):
             targets="Linear",
             scheme=scheme,
             actorder=None,  # added for consistency with past testing configs
-            ignore=["lm_head"],
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
         )
     else:
         oneshot_kwargs["recipe"] = QuantizationModifier(
-            targets="Linear", scheme=scheme, ignore=["lm_head"]
+            targets="Linear",
+            scheme=scheme,
+            ignore=["lm_head", "re:.*mlp.gate[.].*"],
         )

     # Apply quantization.
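The new ignore entry uses a regex (the `re:` prefix) to skip MoE expert-router gates. The `[.]` character class matches a literal dot after `gate`, which is what distinguishes router-gate paths from expert projections like `gate_proj`. A minimal sketch of how the pattern discriminates (the module names here are illustrative Qwen3-MoE-style names, not taken from the diff):

```python
import re

# Same pattern as the diff's ignore entry, without the "re:" prefix.
# "[.]" requires a literal "." after "gate", so "gate_proj" never matches.
pattern = re.compile(r".*mlp.gate[.].*")

# Hypothetical module/parameter names for illustration:
router_gate = "model.layers.0.mlp.gate.weight"            # router gate
expert_proj = "model.layers.0.mlp.experts.0.gate_proj.weight"  # expert MLP

print(bool(pattern.match(router_gate)))  # True  -> ignored by quantization
print(bool(pattern.match(expert_proj)))  # False -> still quantized
```

The exact names matched depend on the model architecture and on how the quantization library resolves ignore patterns; this only shows the regex itself.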
tests/e2e/vLLM/configs/qwen3_fp4_nvfp4.yaml

Lines changed: 7 additions & 0 deletions

@@ -0,0 +1,7 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: NVFP4
+dataset_id: HuggingFaceH4/ultrachat_200k
+dataset_split: train_sft
+num_calibration_samples: 20
tests/e2e/vLLM/configs/qwen3_fp8_dynamic_per_tensor.yaml

Lines changed: 4 additions & 0 deletions

@@ -0,0 +1,4 @@
+cadence: "nightly"
+test_type: "regression"
+model: Qwen/Qwen3-30B-A3B
+scheme: FP8_DYNAMIC

tests/e2e/vLLM/run_vllm.py

Lines changed: 1 addition & 1 deletion

@@ -18,7 +18,7 @@ def parse_args():
     except json.JSONDecodeError as e:
         raise ValueError(f"Invalid JSON input: {e}")

-    if "W4A16_2of4" in scheme:
+    if scheme is not None and "W4A16_2of4" in scheme:
         # required by the kernel
         llm_kwargs["dtype"] = torch.float16
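The one-line fix above guards against `scheme` being `None`: in Python, a membership test on `None` raises `TypeError`, so the check must short-circuit first. A minimal sketch of the failure mode (the `needs_float16` helper is hypothetical, not from the diff):

```python
# Hypothetical helper mirroring the guard in run_vllm.py.
def needs_float16(scheme):
    # Old form, `"W4A16_2of4" in scheme`, raises
    # TypeError: argument of type 'NoneType' is not iterable
    # when the test config sets scheme: None.
    return scheme is not None and "W4A16_2of4" in scheme

print(needs_float16("W4A16_2of4"))  # True
print(needs_float16(None))          # False, no TypeError
print(needs_float16("FP8_DYNAMIC")) # False
```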
2424
