Commit d44a199

Address review comments

Signed-off-by: Neta Zmora <[email protected]>

1 parent 76c542a commit d44a199

File tree

2 files changed: +15 -15 lines changed

tensorrt_llm/_torch/auto_deploy/custom_ops/fused_moe/trtllm_moe.py

Lines changed: 7 additions & 7 deletions
@@ -271,8 +271,8 @@ def trtllm_quant_nvfp4_moe_fused(
     w1_blockscale_fp8: torch.Tensor,  # Block scale for w1 (fp8)
     w2_blockscale_fp8: torch.Tensor,  # Block scale for w2 (fp8)
     w3_blockscale_fp8: torch.Tensor,  # Block scale for w3 (fp8)
-    fc1_act_global: torch.Tensor,  # Global scale for FC1 activations
-    fc2_act_global: torch.Tensor,  # Global scale for FC2 activations
+    fc1_act_global_scale: torch.Tensor,  # Global scale for FC1 activations
+    fc2_act_global_scale: torch.Tensor,  # Global scale for FC2 activations
     fc1_alpha: Optional[
         torch.Tensor
     ] = None,  # Precomputed global scale for FC1 (1.0 / (fc1_act_global * fc1_weight_gs))
@@ -322,21 +322,21 @@ def trtllm_quant_nvfp4_moe_fused(

     fc2_weight_block_scale = w2_blockscale_fp8
     fc2_weight_gs = w2_global_scale
-    fc1_alpha = 1.0 / (fc1_act_global * fc1_weight_gs) if fc1_alpha is None else fc1_alpha
-    fc2_alpha = 1.0 / (fc2_act_global * fc2_weight_gs) if fc2_alpha is None else fc2_alpha
+    fc1_alpha = 1.0 / (fc1_act_global_scale * fc1_weight_gs) if fc1_alpha is None else fc1_alpha
+    fc2_alpha = 1.0 / (fc2_act_global_scale * fc2_weight_gs) if fc2_alpha is None else fc2_alpha

     quant_scales = [
-        fc1_act_global,
+        fc1_act_global_scale,
         fc1_weight_blockscale.view(torch.int32),
         fc1_alpha,
-        fc2_act_global,
+        fc2_act_global_scale,
         fc2_weight_block_scale.view(torch.int32),
         fc2_alpha,
     ]

     if x.dtype in (torch.float16, torch.bfloat16):
         x_q_fp4, input_blockscale = torch.ops.trtllm.fp4_quantize(
-            x, fc1_act_global, NVFP4_BLOCK_SIZE
+            x, fc1_act_global_scale, NVFP4_BLOCK_SIZE
         )
         output_dtype = x.dtype
     else:
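
For context, a minimal sketch of the alpha precomputation that the renamed activation-scale arguments feed into. The scale values below are made up for illustration; the expressions mirror the fallback the op evaluates when fc1_alpha/fc2_alpha are passed as None, so the rename does not change the math.

# Minimal sketch (illustrative values only) of precomputing the FC1/FC2 alphas
# from the activation and weight global scales.
import torch

fc1_act_global_scale = torch.tensor(1.0, dtype=torch.float32)  # global scale for FC1 activations
fc2_act_global_scale = torch.tensor(1.0, dtype=torch.float32)  # global scale for FC2 activations
fc1_weight_gs = torch.tensor(0.5, dtype=torch.float32)         # hypothetical FC1 weight global scale
fc2_weight_gs = torch.tensor(0.25, dtype=torch.float32)        # hypothetical FC2 weight global scale

# Same expressions the op uses as its fallback when the alphas are not supplied.
fc1_alpha = 1.0 / (fc1_act_global_scale * fc1_weight_gs)
fc2_alpha = 1.0 / (fc2_act_global_scale * fc2_weight_gs)

Passing the precomputed alphas lets a caller skip the reciprocal on every invocation.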

tests/unittest/_torch/auto_deploy/unit/singlegpu/custom_ops/test_trtllm_moe.py

Lines changed: 8 additions & 8 deletions
@@ -688,15 +688,15 @@ def round_up(x, y):
         w3_gs,
     ) = _quantize_weights(w1, w2, w3)

-    fc1_act_global = torch.tensor(1.0, device="cuda", dtype=torch.float32)
-    fc2_act_global = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    fc1_activation_gs = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    fc2_activation_gs = torch.tensor(1.0, device="cuda", dtype=torch.float32)

     routing_weights, selected_experts = compute_routing(router_logits, top_k)

     if precompute_fc_alphas:
         fc1_weight_gs = torch.max(w3_gs, w1_gs)
-        fc1_alpha = 1.0 / (fc1_act_global * fc1_weight_gs)
-        fc2_alpha = 1.0 / (fc2_act_global * w2_gs)
+        fc1_alpha = 1.0 / (fc1_activation_gs * fc1_weight_gs)
+        fc2_alpha = 1.0 / (fc2_activation_gs * w2_gs)
     else:
         fc1_alpha = None
         fc2_alpha = None
@@ -715,8 +715,8 @@ def round_up(x, y):
         w1_blockscale,
         w2_blockscale,
         w3_blockscale,
-        fc1_act_global,
-        fc2_act_global,
+        fc1_activation_gs,
+        fc2_activation_gs,
         fc1_alpha=fc1_alpha,
         fc2_alpha=fc2_alpha,
         input_blockscale=None,
@@ -728,12 +728,12 @@ def round_up(x, y):
     def compute_ref_output(w1_gs, w3_gs):
         # Quantize then dequantize the input to emulate the precision loss.
         a_fp4, a_scale_interleaved = torch.ops.trtllm.fp4_quantize(
-            x, fc1_act_global, NVFP4_BLOCK_SIZE
+            x, fc1_activation_gs, NVFP4_BLOCK_SIZE
        )
         x_dq = dequantize_nvfp4_to_dtype(
             a_fp4,
             a_scale_interleaved,
-            fc1_act_global,
+            fc1_activation_gs,
             dtype=otype,
             device=x.device,
             block_size=NVFP4_BLOCK_SIZE,
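
A minimal sketch of the reference path's quantize step with the renamed test-local scale. It assumes a CUDA device and a TensorRT-LLM install that registers torch.ops.trtllm.fp4_quantize; the tensor shapes are made up, and the 16-element block size is an assumption about the NVFP4 format. The test then dequantizes the result back to the original dtype (via its dequantize_nvfp4_to_dtype helper) to emulate NVFP4 precision loss.

# Sketch assuming TensorRT-LLM custom ops are available; shapes are illustrative.
import torch

NVFP4_BLOCK_SIZE = 16  # assumed: NVFP4 scales are per 16-element block

x = torch.randn(8, 64, device="cuda", dtype=torch.bfloat16)
fc1_activation_gs = torch.tensor(1.0, device="cuda", dtype=torch.float32)

# Quantize the activations with the global scale; dequantizing afterwards
# (as the test does) reproduces the precision loss of the NVFP4 path.
a_fp4, a_scale_interleaved = torch.ops.trtllm.fp4_quantize(
    x, fc1_activation_gs, NVFP4_BLOCK_SIZE
)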
