[https://nvbugs/5518713][fix] Trtllm-gen moe backend for blockwise fp8 ckpt (Qwen3-235B-A22B-FP8) (#7856)

jhaotingc · web-flow · commit c33f43e13a1c · 2025-09-26T14:29:32.000-07:00
Signed-off-by: Jhao-Ting Chen &lt;jhaotingc@nvidia.com&gt;
diff --git a/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp8BlockScaleMoe.cpp
@@ -115,7 +115,8 @@ at::Tensor run_fp8_block_scale_moe(at::optional<at::Tensor> const& routing_logit
     else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Renormalize
         || static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::RenormalizeNaive)
     {
-        TORCH_CHECK(false, "Don't support this routing method type Renormalize(Naive).");
+        TORCH_CHECK(top_k <= 8 && top_k > 0,
+            "Current routing kernel (no groups, renormalize) only supports top_k<=8 && top_k>0.");
     }
     else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4)
     {
diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py
@@ -395,12 +395,12 @@ def __init__(
         self,
         num_experts: int,
         top_k: int,
-        n_group: int,
-        topk_group: int,
+        n_group: Optional[int],
+        topk_group: Optional[int],
         intermediate_size: int,
         local_expert_offset: int,
         local_num_experts: int,
-        routed_scaling_factor: float,
+        routed_scaling_factor: Optional[float],
         routing_method_type: int,
     ):
 
@@ -562,12 +562,12 @@ def fp8_block_scale_moe_runner(
         gemm2_weights_scale: torch.Tensor,
         num_experts: int,
         top_k: int,
-        n_group: int,
-        topk_group: int,
+        n_group: Optional[int],
+        topk_group: Optional[int],
         intermediate_size: int,
         local_expert_offset: int,
         local_num_experts: int,
-        routed_scaling_factor: float,
+        routed_scaling_factor: Optional[float],
         routing_method_type: int,
         topk_weights: Optional[torch.Tensor] = None,
         topk_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
@@ -630,12 +630,12 @@ def _(routing_logits: torch.Tensor,
       gemm2_weights_scale: torch.Tensor,
       num_experts: int,
       top_k: int,
-      n_group: int,
-      topk_group: int,
+      n_group: Optional[int],
+      topk_group: Optional[int],
       intermediate_size: int,
       local_expert_offset: int,
       local_num_experts: int,
-      routed_scaling_factor: float,
+      routed_scaling_factor: Optional[float],
       routing_method_type: int,
       topk_weights: Optional[torch.Tensor] = None,
       topk_ids: Optional[torch.Tensor] = None) -> torch.Tensor:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2959,6 +2959,38 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_device(8)
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler,moe_backend",
+        [(8, 1, 8, True, True, True, "DEEPGEMM"),
+         (8, 1, 8, False, True, True, "DEEPGEMM"),
+         (8, 1, 8, True, True, True, "TRTLLM"),
+         (8, 1, 8, False, True, True, "TRTLLM")],
+        ids=[
+            "latency_deepgemm", "throughput_latency_deepgemm", "latency_trtllm",
+            "throughput_latency_trtllm"
+        ])
+    def test_fp8_block_scales(self, tp_size, pp_size, ep_size, attention_dp,
+                              cuda_graph, overlap_scheduler, moe_backend):
+        pytorch_config = dict(
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
+            moe_config=MoeConfig(backend=moe_backend))
+
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        with LLM(f"{llm_models_root()}/Qwen3/Qwen3-235B-A22B-FP8",
+                 tensor_parallel_size=tp_size,
+                 pipeline_parallel_size=pp_size,
+                 moe_expert_parallel_size=ep_size,
+                 **pytorch_config,
+                 enable_attention_dp=attention_dp,
+                 kv_cache_config=kv_cache_config) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(

Original file line number	Diff line number	Diff line change
`@@ -115,7 +115,8 @@ at::Tensor run_fp8_block_scale_moe(at::optional<at::Tensor> const& routing_logit`
`115`	`115`	`else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Renormalize`
`116`	`116`	`\|\| static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::RenormalizeNaive)`
`117`	`117`	`{`
`118`		`- TORCH_CHECK(false, "Don't support this routing method type Renormalize(Naive).");`
	`118`	`+ TORCH_CHECK(top_k <= 8 && top_k > 0,`
	`119`	`+ "Current routing kernel (no groups, renormalize) only supports top_k<=8 && top_k>0.");`
`119`	`120`	`}`
`120`	`121`	`else if (static_cast<RoutingMethodType>(routing_method_type) == RoutingMethodType::Llama4)`
`121`	`122`	`{`