Commit 37e2b64
1 parent 2af6319

Address review comments from tcherckez
Signed-off-by: Neta Zmora <[email protected]>

2 files changed: +18 −18 lines

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py
7 additions, 7 deletions

@@ -273,14 +273,14 @@ def trtllm_quant_nvfp4_moe_fused(
     w3_blockscale_fp8: torch.Tensor,  # Block scale for w3 (fp8 )
     fc1_act_global: torch.Tensor,  # Global scale for FC1 activations
     fc2_act_global: torch.Tensor,  # Global scale for FC2 activations
-    fc1_global: Optional[
+    fc1_alpha: Optional[
         torch.Tensor
     ] = None,  # Precomputed global scale for FC1 (1.0 / (fc1_act_global * fc1_weight_gs))
-    fc2_global: Optional[
+    fc2_alpha: Optional[
         torch.Tensor
     ] = None,  # Precomputed global scale for FC2 (1.0 / (fc2_act_global * fc2_weight_gs))
     input_blockscale: Optional[torch.Tensor] = None,  # Input scale factors for NVFP4 input
-    output_dtype: Optional[torch.dtype] = None,  # Output dtype for NVFP4 input
+    output_dtype: Optional[torch.dtype] = None,  # determines output dtype when input is NVFP4
     mlp_style: str = "gated_mlp",
     act_fn: str = "silu",
 ) -> torch.Tensor:

@@ -322,16 +322,16 @@ def trtllm_quant_nvfp4_moe_fused(

     fc2_weight_block_scale = w2_blockscale_fp8
     fc2_weight_gs = w2_global_scale
-    fc1_global = 1.0 / (fc1_act_global * fc1_weight_gs) if fc1_global is None else fc1_global
-    fc2_global = 1.0 / (fc2_act_global * fc2_weight_gs) if fc2_global is None else fc2_global
+    fc1_alpha = 1.0 / (fc1_act_global * fc1_weight_gs) if fc1_alpha is None else fc1_alpha
+    fc2_alpha = 1.0 / (fc2_act_global * fc2_weight_gs) if fc2_alpha is None else fc2_alpha

     quant_scales = [
         fc1_act_global,
         fc1_weight_blockscale.view(torch.int32),
-        fc1_global,
+        fc1_alpha,
         fc2_act_global,
         fc2_weight_block_scale.view(torch.int32),
-        fc2_global,
+        fc2_alpha,
     ]

     if x.dtype in (torch.float16, torch.bfloat16):
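For reference, the renamed fc1_alpha/fc2_alpha arguments are the precomputed GEMM alpha factors; when they are left as None the op derives them from the global scales, as the second hunk shows. A minimal sketch of precomputing them once (e.g., at weight-load time) and passing them in; the scale values here are hypothetical, and in practice they come from the NVFP4-quantized checkpoint:

import torch

# Hypothetical scalar global scales; real values come from NVFP4 calibration.
fc1_act_global = torch.tensor(0.5)
fc1_weight_gs = torch.tensor(2.0)
fc2_act_global = torch.tensor(0.25)
fc2_weight_gs = torch.tensor(4.0)

# Precompute the per-GEMM alpha factors once, mirroring the fallback
# computation inside trtllm_quant_nvfp4_moe_fused.
fc1_alpha = 1.0 / (fc1_act_global * fc1_weight_gs)
fc2_alpha = 1.0 / (fc2_act_global * fc2_weight_gs)

# These tensors would then be passed as the fc1_alpha= / fc2_alpha= keyword
# arguments of torch.ops.auto_deploy.trtllm_quant_nvfp4_moe_fused.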

tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py
11 additions, 11 deletions

@@ -229,9 +229,6 @@ def test_trtllm_fused_moe(
         activation_func=activation_func,
     )

-    torch.cuda.synchronize()
-    print("before fused_moe.cutlass_fused_moe")
-
     assert itype == torch.bfloat16 or itype == torch.float16, (
         "F16 test only supports bfloat16 or float16"
     )

@@ -256,6 +253,7 @@ def get_fc1_expert_weights(
     _, w1_weight = torch.chunk(w31_weight, 2, dim=1)
     mlp_style = "mlp" if activation_func == "relu2" else "gated_mlp"

+    torch.cuda.synchronize()
     ad_test_output = torch.ops.auto_deploy.trtllm_moe_fused(
         x,
         selected_experts.to(torch.int),

@@ -500,7 +498,7 @@ def act(weight, mask):
             inter_gs,
             dtype=inter.dtype,
             device=inter.device,
-            block_size=16,
+            block_size=NVFP4_BLOCK_SIZE,
         ).cuda()
         out[mask] = inter @ w2[i].transpose(0, 1)
     return (out.view(B, -1, w2.shape[1]) * topk_weight.view(B, -1, 1).to(out.dtype)).sum(dim=1)

@@ -565,6 +563,7 @@ def break_fp4_bytes(a, dtype):
 ]


+@pytest.mark.parametrize("precompute_fc_alphas", [True, False])
 @pytest.mark.parametrize("batch_size", BATCH_SIZES)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("num_experts", NUM_EXPERTS)

@@ -579,6 +578,7 @@ def break_fp4_bytes(a, dtype):
     reason="Requires fp4 and trtllm support",
 )
 def test_trtllm_fused_moe_nvfp4(
+    precompute_fc_alphas,
     batch_size,
     hidden_size,
     num_experts,

@@ -693,13 +693,13 @@ def round_up(x, y):

     routing_weights, selected_experts = compute_routing(router_logits, top_k)

-    if True:
+    if precompute_fc_alphas:
         fc1_weight_gs = torch.max(w3_gs, w1_gs)
-        fc1_global = 1.0 / (fc1_act_global * fc1_weight_gs)
-        fc2_global = 1.0 / (fc2_act_global * w2_gs)
+        fc1_alpha = 1.0 / (fc1_act_global * fc1_weight_gs)
+        fc2_alpha = 1.0 / (fc2_act_global * w2_gs)
     else:
-        fc1_global = None
-        fc2_global = None
+        fc1_alpha = None
+        fc2_alpha = None

     mlp_style = "mlp" if activation_func == "relu2" else "gated_mlp"
     trtllm_output = torch.ops.auto_deploy.trtllm_quant_nvfp4_moe_fused(

@@ -717,8 +717,8 @@ def round_up(x, y):
         w3_blockscale,
         fc1_act_global,
         fc2_act_global,
-        fc1_global=fc1_global,
-        fc2_global=fc2_global,
+        fc1_alpha=fc1_alpha,
+        fc2_alpha=fc2_alpha,
         input_blockscale=None,
         output_dtype=otype,
         mlp_style=mlp_style,
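The new precompute_fc_alphas parameter makes the same test body exercise both code paths: caller-supplied alphas and the op's inline fallback. A self-contained sketch of that parametrization pattern, using toy values rather than the real MoE setup:

import pytest

@pytest.mark.parametrize("precompute_fc_alphas", [True, False])
def test_alpha_code_paths(precompute_fc_alphas):
    act_global, weight_gs = 2.0, 4.0
    # Either precompute the alpha up front or pass None and let the op derive it.
    alpha = 1.0 / (act_global * weight_gs) if precompute_fc_alphas else None
    # Emulate the op's fallback so both parametrizations check the same value.
    effective = alpha if alpha is not None else 1.0 / (act_global * weight_gs)
    assert effective == pytest.approx(0.125)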
