
Commit f2e34db
attempt to fix NVFP4 + selective recompute

Signed-off-by: Zhongbo Zhu <[email protected]>
Parent: 3a94887

6 files changed: +44, -19 lines

megatron/core/transformer/attention.py  (13 additions, 5 deletions)

@@ -205,12 +205,20 @@ def __init__(
         if (
             HAVE_TE
-            and self.config.fp8
-            and self.config.fp8_recipe != 'delayed'
-            and is_te_min_version("2.6.0dev0")
             and isinstance(self.linear_proj, TELinear)
+            and (
+                (
+                    self.config.fp8
+                    and self.config.fp8_recipe != 'delayed'
+                    and is_te_min_version("2.6.0dev0")
+                )
+                or (
+                    self.config.fp4
+                    and is_te_min_version("2.7.0.dev0")
+                )
+            )
         ):
-            # For fp8 training, the output of the fused core_attn is saved by itself, and
+            # For fp8/fp4 training, the output of the fused core_attn is saved by itself, and
             # linear_proj also saves the quantized tensor of this output. Here we set the
             # linear_proj to save the original input tensors to avoid the extra memory usage of
             # the quantized tensor.

@@ -1129,7 +1137,7 @@ def _backward_output_proj(self):
         self.linear_proj.backward_dw()

     def set_for_recompute_input_layernorm(self):
-        """Set the attention layer for recompute input_layernorm. Only needed for fp8."""
+        """Set the attention layer for recompute input_layernorm. Only needed for fp8/fp4."""
         from megatron.core.extensions.transformer_engine import set_save_original_input

         set_save_original_input(self.linear_qkv)
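
The new condition above is the core of the fix: the save-original-input behavior for linear_proj now also triggers for the NVFP4 recipe, not only for non-delayed FP8, with each path gated on its own minimum Transformer Engine version. The standalone sketch below restates that predicate outside of Megatron-LM for readability; the helper name and the simplified config stand-in are illustrative only, not part of the repository.

# Illustrative restatement of the gating logic above; `_should_save_original_input`
# and `_Config` are hypothetical stand-ins, not Megatron-LM or Transformer Engine APIs.
from dataclasses import dataclass
from typing import Optional


@dataclass
class _Config:
    fp8: Optional[str] = None      # recipe name when FP8 is enabled, None otherwise (assumed)
    fp8_recipe: str = "tensorwise"
    fp4: bool = False


def _should_save_original_input(
    config: _Config,
    have_te: bool,
    proj_is_te_linear: bool,
    te_supports_fp8_path: bool,   # stands in for is_te_min_version("2.6.0dev0")
    te_supports_fp4_path: bool,   # stands in for is_te_min_version("2.7.0.dev0")
) -> bool:
    """True when linear_proj should keep its original (unquantized) input tensors."""
    if not (have_te and proj_is_te_linear):
        return False
    fp8_path = bool(config.fp8) and config.fp8_recipe != "delayed" and te_supports_fp8_path
    fp4_path = config.fp4 and te_supports_fp4_path
    return fp8_path or fp4_path


if __name__ == "__main__":
    # NVFP4 run on a new-enough Transformer Engine: the memory-saving path is taken.
    print(_should_save_original_input(_Config(fp4=True), True, True, False, True))  # True
    # Delayed-scaling FP8 is still excluded, as before this commit.
    print(_should_save_original_input(_Config(fp8="hybrid", fp8_recipe="delayed"), True, True, True, False))  # False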

megatron/core/transformer/moe/moe_layer.py  (3 additions, 3 deletions)

@@ -204,7 +204,7 @@ def experts_compute(
         if self.use_shared_expert and not self.shared_expert_overlap:
             # Compute the shared expert separately when not overlapped with communication.
             if self.shared_experts_recompute:
-                if self.config.fp8:
+                if self.config.fp8 or self.config.fp4:
                     shared_expert_output = te_checkpoint(
                         self.shared_experts,
                         False,

@@ -272,7 +272,7 @@ def custom_forward(hidden_states):
             return output, mlp_bias

         if self.moe_layer_recompute:
-            if self.config.fp8:
+            if self.config.fp8 or self.config.fp4:
                 output, mlp_bias = te_checkpoint(
                     custom_forward,
                     False,

@@ -294,7 +294,7 @@ def backward_dw(self):
             self.shared_experts.backward_dw()

     def set_for_recompute_pre_mlp_layernorm(self):
-        """Set the MoE layer for recompute pre_mlp_layernorm. Only needed for fp8."""
+        """Set the MoE layer for recompute pre_mlp_layernorm. Only needed for fp8/fp4."""
         # If shared_experts_recompute is used, nothing needs to be done because the checkpoint
         # function will save the original input tensors.
         if self.shared_experts is not None and not self.shared_experts_recompute:
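
In both recompute branches the only change is the trigger: the Transformer Engine checkpoint path is now taken when either fp8 or fp4 is active, so quantized recompute replays through te_checkpoint rather than the plain activation checkpoint. A minimal, self-contained sketch of that dispatch pattern follows; the stand-in checkpoint callables are placeholders, not the real te_checkpoint signature.

# Toy dispatcher mirroring the pattern above (not Megatron-LM code). The two
# checkpoint callables are placeholders: in the real code they are Transformer
# Engine's te_checkpoint and Megatron's plain activation checkpoint.
from typing import Any, Callable


def checkpointed_call(
    forward_fn: Callable[..., Any],
    *args: Any,
    fp8: bool = False,
    fp4: bool = False,
    te_checkpoint_fn: Callable[..., Any] = lambda fn, *a: fn(*a),
    plain_checkpoint_fn: Callable[..., Any] = lambda fn, *a: fn(*a),
) -> Any:
    # Quantized training must recompute through the TE-aware checkpoint so the
    # replayed forward sees the same quantization handling as the original pass.
    if fp8 or fp4:
        return te_checkpoint_fn(forward_fn, *args)
    return plain_checkpoint_fn(forward_fn, *args)


if __name__ == "__main__":
    out = checkpointed_call(lambda x: x * 2, 3, fp4=True)
    print(out)  # 6, via the (stand-in) TE checkpoint path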

megatron/core/transformer/moe/shared_experts.py  (5 additions, 2 deletions)

@@ -62,8 +62,11 @@ def __init__(
         else:
             self.gate_weight = None

-        if self.config.fp8 and is_te_min_version("2.6.0dev0"):
-            # For fp8 training, the output of pre_mlp_layernorm is saved by router, and
+        if (
+            (self.config.fp8 and is_te_min_version("2.6.0dev0"))
+            or (self.config.fp4 and is_te_min_version("2.7.0.dev0"))
+        ):
+            # For fp8/fp4 training, the output of pre_mlp_layernorm is saved by router, and
             # the shared expert linear_fc1 also saves the quantized tensor of this output.
             # Here we set the linear_fc1 to save the original input tensors to avoid the extra
             # memory usage of the quantized tensor.
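
Both quantized paths are additionally gated on a minimum Transformer Engine version: 2.6.0dev0 for the FP8 branch and 2.7.0.dev0 for the NVFP4 branch. The snippet below is only a stand-in for such a check, using packaging's PEP 440 ordering; it is not Megatron-LM's actual is_te_min_version implementation, just a sketch of the comparison semantics.

# Stand-in sketch of a minimum-version gate (not the real is_te_min_version helper).
from packaging.version import Version


def _min_version_ok(installed: str, required: str) -> bool:
    """True when the installed version is at least the required one (PEP 440 ordering)."""
    return Version(installed) >= Version(required)


if __name__ == "__main__":
    # The NVFP4 path above requires Transformer Engine >= 2.7.0.dev0.
    print(_min_version_ok("2.7.0", "2.7.0.dev0"))  # True: dev releases sort before the final
    print(_min_version_ok("2.6.0", "2.7.0.dev0"))  # False: too old for the fp4 branch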

megatron/core/transformer/multi_latent_attention.py  (15 additions, 6 deletions)

@@ -177,12 +177,20 @@ def __init__(
         if (
             HAVE_TE
-            and self.config.fp8
-            and self.config.fp8_recipe != 'delayed'
-            and is_te_min_version("2.6.0dev0")
             and isinstance(self.linear_proj, TELinear)
+            and (
+                (
+                    self.config.fp8
+                    and self.config.fp8_recipe != 'delayed'
+                    and is_te_min_version("2.6.0dev0")
+                )
+                or (
+                    self.config.fp4
+                    and is_te_min_version("2.7.0.dev0")
+                )
+            )
         ):
-            # For fp8 training, the output of the fused core_attn is saved by itself, and
+            # For fp8/fp4 training, the output of the fused core_attn is saved by itself, and
             # linear_proj also saves the quantized tensor of this output. Here we set the
             # linear_proj to save the original input tensors to avoid the extra memory usage of
             # the quantized tensor.

@@ -781,7 +789,8 @@ def qkv_up_proj_and_rope_apply(q_compressed, kv_compressed, k_pos_emb, rotary_po
             return query, key, value

         if self.recompute_up_proj:
-            self.qkv_up_checkpoint = tensor_parallel.CheckpointWithoutOutput(fp8=self.config.fp8)
+            quantization = self.config.fp8 or self.config.fp4
+            self.qkv_up_checkpoint = tensor_parallel.CheckpointWithoutOutput(fp8=quantization)
             query, key, value = self.qkv_up_checkpoint.checkpoint(
                 qkv_up_proj_and_rope_apply, q_compressed, kv_compressed, k_pos_emb, rotary_pos_emb
             )

@@ -911,7 +920,7 @@ def _backward_output_proj(self):
         self.linear_proj.backward_dw()

     def set_for_recompute_input_layernorm(self):
-        """Set the attention layer for recompute input_layernorm. Only needed for fp8."""
+        """Set the attention layer for recompute input_layernorm. Only needed for fp8/fp4."""
         from megatron.core.extensions.transformer_engine import set_save_original_input

         if self.config.q_lora_rank is not None:
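
One detail worth noting in the recompute_up_proj hunk: `self.config.fp8 or self.config.fp4` uses Python's short-circuit `or`, which returns the first truthy operand, so `quantization` may be a recipe string rather than a literal bool when FP8 is configured as a string-valued option (an assumption about the config field's type). The `fp8=` argument only appears to need truthiness here, so the behavior is the same either way; the tiny sketch below just makes the truth table explicit.

# Explicit truth table for the `quantization = config.fp8 or config.fp4` line above.
# `fp8` is assumed to be a recipe string or None; `fp4` is assumed to be a bool.
def quantized_recompute(fp8, fp4) -> bool:
    return bool(fp8 or fp4)


print(quantized_recompute("hybrid", False))  # True  (FP8 recipe set)
print(quantized_recompute(None, True))       # True  (NVFP4 enabled)
print(quantized_recompute(None, False))      # False (neither quantized format)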

megatron/core/transformer/multi_token_prediction.py  (5 additions, 0 deletions)

@@ -570,6 +570,11 @@ def _proj_and_transformer_layer(
             fp8_context = nullcontext()
             transformer_layer_fp8_context = nullcontext()

+        # TODO: currently no support for FP4 in MTP layers because we need more numerical validation
+        # raise Error here to avoid unexpected behavior
+        if self.config.fp4:
+            raise ValueError("FP4 is not supported for MTP layers yet.")
+
         with rng_context:
             with fp8_context:
                 hidden_states = self._concat_embeddings(hidden_states, decoder_input)
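
The MTP change is purely defensive: FP4 is rejected up front rather than risk an unvalidated numerical path. A hypothetical minimal check of that failure mode is sketched below; the fake config class is not from the repository.

# Hypothetical smoke test (not from the repo) for the guard added above:
# combining FP4 with MTP layers should fail fast with a clear error.
class _FakeConfig:
    fp4 = True


def _mtp_fp4_guard(config) -> None:
    # Mirrors the new guard in _proj_and_transformer_layer.
    if config.fp4:
        raise ValueError("FP4 is not supported for MTP layers yet.")


try:
    _mtp_fp4_guard(_FakeConfig())
except ValueError as err:
    print(f"raised as expected: {err}")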

megatron/core/transformer/transformer_layer.py  (3 additions, 3 deletions)

@@ -391,11 +391,11 @@ def __init__(
             and not self.config.external_cuda_graph
         ):
             self.recompute_input_layernorm = True
-            if self.config.fp8:
+            if self.config.fp8 or self.config.fp4:
                 self.self_attention.set_for_recompute_input_layernorm()
         if not isinstance(self.pre_mlp_layernorm, IdentityOp):
             self.recompute_pre_mlp_layernorm = True
-            if self.config.fp8:
+            if self.config.fp8 or self.config.fp4:
                 if isinstance(self.mlp, MoELayer):
                     self.mlp.set_for_recompute_pre_mlp_layernorm()
                 else:

@@ -595,7 +595,7 @@ def _forward_mlp(self, hidden_states, inference_context=None):
             )

         if self.recompute_mlp:
-            if self.config.fp8:
+            if self.config.fp8 or self.config.fp4:
                 # import here to avoid circular import
                 from megatron.core.extensions.transformer_engine import te_checkpoint
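
Taken together with the attention and MoE changes, the layer-level switches above mean the save-original-input hooks now fire for either quantized format. A self-contained toy of that wiring is shown below; the fake classes only echo what the real set_for_recompute_* hooks are asked to do and are not Megatron-LM code.

# Self-contained toy (not Megatron-LM code) of the quantization-aware recompute wiring above.
class _FakeAttention:
    def set_for_recompute_input_layernorm(self) -> None:
        print("attention: save original inputs for input_layernorm recompute")


class _FakeMoE:
    def set_for_recompute_pre_mlp_layernorm(self) -> None:
        print("moe: save original inputs for pre_mlp_layernorm recompute")


def setup_layernorm_recompute(attention, mlp, fp8=None, fp4=False) -> None:
    # Only quantized training (FP8 or NVFP4) needs the extra save-original-input setup.
    if fp8 or fp4:
        attention.set_for_recompute_input_layernorm()
        if hasattr(mlp, "set_for_recompute_pre_mlp_layernorm"):
            mlp.set_for_recompute_pre_mlp_layernorm()


setup_layernorm_recompute(_FakeAttention(), _FakeMoE(), fp4=True)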