Commit 3a94887
ADLR/megatron-lm!4225 - [Dev][NVFP4][MOE] Proper NVFP4 Zero Padding for MOE
Co-authored-by: Zhongbo Zhu <[email protected]>
1 parent 6b7197c commit 3a94887

File tree

4 files changed: 19 additions & 6 deletions


megatron/core/fp4_utils.py

Lines changed: 1 addition & 1 deletion
@@ -49,7 +49,7 @@ def is_nvfp4tensor(tensor: torch.Tensor) -> bool:
 
 def get_fp4_align_size(fp4_recipe: Fp4Recipe) -> int:
     """
-    Get the alignment size required for FP4 GEMM.
+    Get the alignment size required for FP4 GEMM.
     FP4 GEMM requires Blackwell and later architectures.
 
     The value 32 is a hardware requirement: TMA (Tensor Memory Accelerator) requires
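For context on how this alignment is consumed elsewhere in the commit: token counts fed into an NVFP4 grouped GEMM must be zero-padded up to a multiple of the 32-element alignment documented above. A minimal sketch, assuming a hypothetical helper that is not part of megatron.core:

# Hypothetical helper, not part of megatron.core: round a token count up to the
# alignment that get_fp4_align_size() reports for an NVFP4 recipe.
def round_up_to_align(num_tokens: int, align_size: int = 32) -> int:
    """Return the smallest multiple of align_size that is >= num_tokens."""
    return ((num_tokens + align_size - 1) // align_size) * align_size

# Example: 70 tokens routed to one expert are zero-padded up to 96 rows.
assert round_up_to_align(70, 32) == 96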

megatron/core/transformer/moe/experts.py

Lines changed: 4 additions & 2 deletions
@@ -21,8 +21,8 @@
     ShardedTensorFactory,
 )
 from megatron.core.dist_checkpointing.utils import replace_prefix_for_sharding
-from megatron.core.fp8_utils import get_fp8_align_size
 from megatron.core.fp4_utils import get_fp4_align_size
+from megatron.core.fp8_utils import get_fp8_align_size
 from megatron.core.fusions.fused_bias_geglu import quick_gelu, weighted_bias_quick_geglu_impl
 from megatron.core.fusions.fused_bias_swiglu import weighted_bias_swiglu_impl
 from megatron.core.fusions.fused_weighted_squared_relu import weighted_squared_relu_impl
@@ -136,7 +136,9 @@ def glu(x):
             and "moe_act" in self.config.recompute_modules
         )
         if self.activation_recompute and (self.config.fp8 or self.config.fp4):
-            raise ValueError("moe_act recompute for fp8 or fp4 cannot work with the legacy GroupedMLP.")
+            raise ValueError(
+                "moe_act recompute for fp8 or fp4 cannot work with the legacy GroupedMLP."
+            )
 
         @jit_fuser
         def activation_func_with_probs(x, probs):

megatron/core/transformer/moe/token_dispatcher.py

Lines changed: 9 additions & 1 deletion
@@ -8,8 +8,8 @@
 
 from megatron.core import utils
 from megatron.core.config import is_experimental_enabled
-from megatron.core.fp8_utils import get_fp8_align_size
 from megatron.core.fp4_utils import get_fp4_align_size
+from megatron.core.fp8_utils import get_fp8_align_size
 from megatron.core.fusions.fused_indices_converter import fused_indices_to_multihot
 from megatron.core.fusions.fused_pad_routing_map import fused_pad_routing_map
 from megatron.core.tensor_parallel import (
@@ -1143,6 +1143,14 @@ def get_restored_hidden_states_by_experts(self, hidden_states: torch.Tensor) ->
         )
         return hidden_states
 
+    def get_align_size_for_quantization(self):
+        """Get the alignment size for quantization."""
+        if self.config.fp8:
+            return get_fp8_align_size(self.config.fp8_recipe)
+        elif self.config.fp4:
+            return get_fp4_align_size(self.config.fp4_recipe)
+        return 16
+
 
 class MoEFlexTokenDispatcher(MoETokenDispatcher):
     """A flexible token dispatcher that abstracts the underlying tensor and expert

megatron/core/transformer/transformer_config.py

Lines changed: 5 additions & 2 deletions
@@ -1313,13 +1313,16 @@ def __post_init__(self):
         if self.moe_router_padding_for_fp8:
             # enable moe_router_padding_for_quantization
             warnings.warn(
-                "--moe-router-padding-for-fp8 is going to be deprecated. Use --moe-router-padding-for-quantization instead."
+                "--moe-router-padding-for-fp8 is going to be deprecated. "
+                "Use --moe-router-padding-for-quantization instead."
             )
             self.moe_router_padding_for_quantization = True
 
         if self.moe_router_padding_for_quantization:
             if self.fp8 is None and self.fp4 is None:
-                raise ValueError("fp8/fp4 must be specified when moe_router_padding_for_quantization is True.")
+                raise ValueError(
+                    "fp8/fp4 must be specified when moe_router_padding_for_quantization is True."
+                )
 
             if self.moe_token_dispatcher_type in ["allgather", "alltoall_seq"]:
                 raise ValueError(
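The __post_init__ changes above fold the deprecated fp8-specific flag into the quantization-agnostic one and require an active fp8 or fp4 recipe before router padding is enabled. A standalone sketch of that validation; the attribute names mirror TransformerConfig, but the class itself is illustrative only:

import warnings

# Standalone sketch of the validation in the hunk above; RouterPaddingOptions
# is an illustrative stand-in, not a class from megatron.core.
class RouterPaddingOptions:
    def __init__(self, fp8=None, fp4=None,
                 moe_router_padding_for_fp8=False,
                 moe_router_padding_for_quantization=False):
        self.fp8, self.fp4 = fp8, fp4
        if moe_router_padding_for_fp8:
            # Deprecated flag is mapped onto the quantization-agnostic one.
            warnings.warn(
                "--moe-router-padding-for-fp8 is going to be deprecated. "
                "Use --moe-router-padding-for-quantization instead."
            )
            moe_router_padding_for_quantization = True
        if moe_router_padding_for_quantization and fp8 is None and fp4 is None:
            raise ValueError(
                "fp8/fp4 must be specified when moe_router_padding_for_quantization is True."
            )
        self.moe_router_padding_for_quantization = moe_router_padding_for_quantization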
