
Commit bfbf13f

Merge remote-tracking branch 'github/dev' into ko3n1g/chore/main-to-dev

2 parents: b55a544 + 693587d


48 files changed (+2622, -207 lines)

megatron/core/dist_checkpointing/state_dict_utils.py

Lines changed: 6 additions & 1 deletion
@@ -13,7 +13,7 @@
     StateDict,
     apply_factories,
 )
-from .utils import extract_nonpersistent, extract_sharded_base
+from .utils import _clean_metadata_for_serialization, extract_nonpersistent, extract_sharded_base
 from .validation import determine_global_metadata, validate_sharding_integrity


@@ -43,6 +43,11 @@ def save_preprocess(
     sharded_part = filter_out_empty_flatten_tensor(sharded_part)
     if validate_access_integrity:
         preprocessed_common_state_dict = common_state_dict
+        if "content_metadata" in preprocessed_common_state_dict:
+            preprocessed_common_state_dict["content_metadata"] = _clean_metadata_for_serialization(
+                preprocessed_common_state_dict["content_metadata"]
+            )
+
         if preprocess_common_before_consistancy_check:
             preprocessed_common_state_dict = preprocess_common_before_consistancy_check(
                 common_state_dict
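
This change strips non-serializable objects out of `content_metadata` before the common state dict is used in the access-integrity consistency check. A minimal sketch of the effect, assuming the helper is importable from `megatron.core.dist_checkpointing.utils` as added in this commit; the metadata keys and the `FakeProcessGroup` placeholder are made up for illustration:

```python
from megatron.core.dist_checkpointing.utils import _clean_metadata_for_serialization


class FakeProcessGroup:
    """Stand-in for a torch.distributed ProcessGroup, which cannot be pickled or compared."""


common_state_dict = {
    "content_metadata": {"some_flag": True, "dp_cp_group": FakeProcessGroup()}
}

# Mirrors what save_preprocess now does before the consistency check.
if "content_metadata" in common_state_dict:
    common_state_dict["content_metadata"] = _clean_metadata_for_serialization(
        common_state_dict["content_metadata"]
    )

assert "dp_cp_group" not in common_state_dict["content_metadata"]
assert common_state_dict["content_metadata"]["some_flag"] is True
```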

megatron/core/dist_checkpointing/utils.py

Lines changed: 17 additions & 0 deletions
@@ -330,3 +330,20 @@ def debug_msg(msg: str):
     """
     with logger_stack(None, None) as (stacked_name, last_logger):
         last_logger.debug(f"{stacked_name} {msg}")
+
+
+def _clean_metadata_for_serialization(metadata: dict) -> dict:
+    """Create a clean copy of metadata for serialization by removing non-serializable objects.
+
+    Args:
+        metadata: Original metadata dict
+
+    Returns:
+        Clean metadata dict suitable for serialization
+    """
+    if metadata is None:
+        return None
+    clean_metadata = metadata.copy()
+    # Remove dp_cp_group as it's not serializable
+    clean_metadata.pop('dp_cp_group', None)
+    return clean_metadata
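
The helper is a shallow copy plus a `pop`, so the caller's dict is left intact and `None` passes through unchanged. A small usage sketch (the metadata contents are illustrative only):

```python
from megatron.core.dist_checkpointing.utils import _clean_metadata_for_serialization

metadata = {"some_flag": True, "dp_cp_group": object()}  # object() stands in for a process group
clean = _clean_metadata_for_serialization(metadata)

assert "dp_cp_group" not in clean        # removed from the copy
assert "dp_cp_group" in metadata         # the original dict is untouched
assert _clean_metadata_for_serialization(None) is None
```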

megatron/core/distributed/distributed_data_parallel.py

Lines changed: 29 additions & 21 deletions
@@ -8,7 +8,7 @@

 from .. import parallel_state
 from ..config_logger import has_config_logger_enabled, log_config_to_disk
-from ..fp8_utils import is_float8tensor
+from ..fp8_utils import is_float8tensor, post_all_gather_processing
 from ..process_groups_config import ProcessGroupCollection
 from ..transformer.cuda_graphs import is_graph_capturing
 from ..transformer.transformer_config import TransformerConfig
@@ -500,26 +500,34 @@ def start_param_sync(self, *unused, force_sync: bool = False, force_dispatch: bo

         for bucket_group in self.bucket_groups + self.expert_parallel_bucket_groups:
             bucket_group.start_param_sync(force_sync=force_sync)
-            # For MXFP8 params, we need to copy the all-gathered param data from the buffer to
-            # the param.data, since param buffer is not mapped to model params for MXFP8 case.
-            # The paramaters are cast from bf16 to MXFP8 during copy.
-            # In the case of "overlap_param_gather=True", the param copy is done
-            # in "finish_param_sync" stage after zeroing the shared gardient buffers.
-            if (
-                self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
-                and not self.ddp_config.overlap_param_gather
-            ):
-                for bucket in bucket_group.buckets:
-                    for param in bucket.params:
-                        param_start, param_end = bucket.param_to_index[param]
-                        param_slice = bucket.param_data.view(-1)[param_start:param_end]
-                        param.data.copy_(param_slice.view(param.data.shape))
-                    # All-gathered params are not needed after being copied to param.data.
-                    # Zero out the param buffer (shared with grad buffer) for gradient accumulation.
-                    # We cannot zero out the entire grad buffer because one grad buffer may
-                    # correspond to multiple param buffers. If we zero out the entire grad buffer,
-                    # it would clear the data of those param buffers that have not yet completed AG.
-                    bucket.param_data.zero_()
+
+            if not self.ddp_config.overlap_param_gather:
+                # For MXFP8 params, we need to copy the all-gathered param data from the buffer to
+                # the param.data, since param buffer is not mapped to model params for MXFP8 case.
+                # The paramaters are cast from bf16 to MXFP8 during copy.
+                # In the case of "overlap_param_gather=True", the param copy is done
+                # in "finish_param_sync" stage after zeroing the shared gardient buffers.
+                if self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag:
+                    for bucket in bucket_group.buckets:
+                        for param in bucket.params:
+                            param_start, param_end = bucket.param_to_index[param]
+                            param_slice = bucket.param_data.view(-1)[param_start:param_end]
+                            param.data.copy_(param_slice.view(param.data.shape))
+                        # All-gathered params are not needed after being copied to param.data.
+                        # Zero out the param buffer (shared with grad buffer) for gradient
+                        # accumulation. We cannot zero out the entire grad buffer because one grad
+                        # buffer may correspond to multiple param buffers. If we zero out the entire
+                        # grad buffer, it would clear the data of those param buffers that have not
+                        # yet completed AG.
+                        bucket.param_data.zero_()
+                else:
+                    fp8_params = []
+                    for bucket in bucket_group.buckets:
+                        for param in bucket.params:
+                            if is_float8tensor(param):
+                                fp8_params.append(param)
+                    if len(fp8_params) > 0:
+                        post_all_gather_processing(fp8_params)

     def start_grad_sync(self, *unused):
         """

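For the distributed_data_parallel.py change above: the post-all-gather work is now split on `overlap_param_gather`. When the gather is synchronous, MXFP8 params are copied out of the reused grad buffer and the buffer is zeroed, while ordinary FP8 params get their transpose/columnwise storage rebuilt via `post_all_gather_processing`; when the gather is overlapped, the same work is deferred to `finish_param_sync`. A condensed sketch of that control flow (not the actual method; the bucket bookkeeping is simplified):

```python
from megatron.core.fp8_utils import is_float8tensor, post_all_gather_processing


def _post_param_sync_sketch(ddp_config, bucket_group):
    """Simplified view of what start_param_sync now does after launching the all-gather."""
    if ddp_config.overlap_param_gather:
        return  # handled later in finish_param_sync, once the async all-gather completes

    if ddp_config.reuse_grad_buf_for_mxfp8_param_ag:
        # MXFP8: copy gathered data out of the shared param/grad buffer, then zero the
        # buffer so it can be reused for gradient accumulation.
        for bucket in bucket_group.buckets:
            for param in bucket.params:
                start, end = bucket.param_to_index[param]
                param.data.copy_(bucket.param_data.view(-1)[start:end].view(param.data.shape))
            bucket.param_data.zero_()
    else:
        # FP8 (non-MXFP8): rebuild transpose/columnwise data once per bucket group.
        fp8_params = [
            p for bucket in bucket_group.buckets for p in bucket.params if is_float8tensor(p)
        ]
        if fp8_params:
            post_all_gather_processing(fp8_params)
```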
megatron/core/distributed/param_and_grad_buffer.py

Lines changed: 15 additions & 5 deletions
@@ -17,7 +17,12 @@
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.rerun_state_machine import get_rerun_state_machine

-from ..fp8_utils import is_float8tensor, is_mxfp8tensor, modify_underlying_storage
+from ..fp8_utils import (
+    is_float8tensor,
+    is_mxfp8tensor,
+    modify_underlying_storage,
+    post_all_gather_processing,
+)
 from ..utils import is_torch_min_version, log_on_each_pipeline_stage
 from .distributed_data_parallel_config import DistributedDataParallelConfig
 from .reduce_scatter_with_fp32_accumulation import reduce_scatter_with_fp32_accumulation
@@ -311,10 +316,7 @@ def finish_param_sync(self, skip_next_bucket_dispatch: bool = False):
         # For the mxfp8_param with "reuse_grad_buf_for_mxfp8_param_ag=True",
         # we need to copy the param_data from the shared_param/grad_buffer to param.data
         # after the param all-gather.
-        if (
-            self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag
-            and self.ddp_config.overlap_param_gather
-        ):
+        if self.ddp_config.reuse_grad_buf_for_mxfp8_param_ag:
             for bucket in self.buckets:
                 for param in bucket.params:
                     param_start, param_end = bucket.param_to_index[param]
@@ -326,6 +328,14 @@ def finish_param_sync(self, skip_next_bucket_dispatch: bool = False):
                 # correspond to multiple param buffers. If we zero out the entire grad buffer,
                 # it would clear the data of those param buffers that have not yet completed AG.
                 bucket.param_data.zero_()
+        else:
+            fp8_params = []
+            for bucket in self.buckets:
+                for param in bucket.params:
+                    if is_float8tensor(param):
+                        fp8_params.append(param)
+            if len(fp8_params) > 0:
+                post_all_gather_processing(fp8_params)

     def start_grad_sync(self):
         """

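For the param_and_grad_buffer.py change above: `finish_param_sync` only runs on the overlapped-gather path, so the explicit `overlap_param_gather` check is redundant and is dropped, and the new `else` branch mirrors the DDP change. The collect-and-dispatch pattern now appears in both places; a small helper like the following (not part of the commit, shown only to illustrate the shared pattern) would capture it:

```python
from typing import Iterable

from megatron.core.fp8_utils import is_float8tensor, post_all_gather_processing


def run_fp8_post_all_gather(buckets: Iterable) -> None:
    """Collect FP8 params from the given buckets and rebuild their transpose/columnwise data."""
    fp8_params = [p for bucket in buckets for p in bucket.params if is_float8tensor(p)]
    if fp8_params:
        post_all_gather_processing(fp8_params)
```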
megatron/core/extensions/transformer_engine.py

Lines changed: 48 additions & 6 deletions
@@ -42,6 +42,7 @@
 from megatron.core.transformer.mlp import MLP
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.utils import (
+    ensure_metadata_has_dp_cp_group,
     is_layer_window_attention,
     make_sharded_tensors_for_checkpoint,
 )
@@ -420,6 +421,9 @@ def __init__(
                 # duplicated across TP ranks
                 setattr(param, "sequence_parallel", self.config.sequence_parallel)

+        tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
+
     def forward(self, x):
         """Forward."""
         _is_first_microbatch = (
@@ -444,7 +448,14 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
             self.parallel_mode is None
         ), "TELinear sharded_state_dict can only be used with duplicated parallel mode"
         state_dict = self.state_dict(prefix="", keep_vars=True)
-        return make_sharded_tensors_for_checkpoint(state_dict, prefix, None, sharded_offsets)
+        return make_sharded_tensors_for_checkpoint(
+            state_dict,
+            prefix,
+            None,
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
+        )

     def backward_dw(self):
         """Compute weight gradients during the backward pass if delay_wgrad_compute is enabled."""
@@ -492,6 +503,7 @@ def __init__(

         # TODO: For backward compatibility, remove in v0.15.
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group

         # TE returns a zero length Tensor when bias=False and
         # return_bias=True, but we prefer None. So in that case we
@@ -625,9 +637,15 @@ def forward(self, x):

     def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 0, bias sharded"""
+        metadata = ensure_metadata_has_dp_cp_group(metadata)
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 0, "bias": 0},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )

     def __repr__(self):
@@ -670,6 +688,7 @@ def __init__(
         if gather_output:
             raise ValueError("Transformer Engine linear layers do not support gather_output = True")
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
         world_size = get_pg_size(tp_group)
         rank = get_pg_rank(tp_group)

@@ -720,7 +739,12 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 0, bias sharded"""
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 0, "bias": 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 0, "bias": 0},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )

     def __repr__(self):
@@ -764,6 +788,7 @@ def __init__(
                 "Transformer Engine linear layers do not support input_is_parallel = False"
             )
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group

         super().__init__(
             input_size=input_size,
@@ -814,7 +839,12 @@ def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
         """Sharding along axis 1, bias not sharded"""
         state_dict = self.state_dict(prefix="", keep_vars=True)
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {"weight": 1}, sharded_offsets
+            state_dict,
+            prefix,
+            {"weight": 1},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )

     def __repr__(self):
@@ -901,6 +931,7 @@ def __init__(
             assert hasattr(
                 pg_collection, "hcp"
             ), "TEDotProductAttention pg_collection must have hierarchical cp pg"
+        self._tp_group = pg_collection.tp

         if is_te_min_version("0.10.0"):
             extra_kwargs["attention_type"] = attention_type
@@ -1078,7 +1109,12 @@ def sharded_state_dict(
         else:
             state_dict = {}
         return make_sharded_tensors_for_checkpoint(
-            state_dict, prefix, {'softmax_offset': 0}, sharded_offsets
+            state_dict,
+            prefix,
+            {'softmax_offset': 0},
+            sharded_offsets,
+            tp_group=self._tp_group,
+            dp_cp_group=metadata["dp_cp_group"],
         )


@@ -1138,6 +1174,7 @@ def __init__(
         # The comms between TP and EP group is explicitly handled by MoE token dispatcher.
         # So we disable comms by making TE agnostic of model parallel.
         tp_group = get_tensor_model_parallel_group_if_none(tp_group, is_expert=is_expert)
+        self._tp_group = tp_group
         tp_size = get_pg_size(tp_group)

         self.explicit_expert_comm = is_expert and (tp_size > 1 or self.expert_parallel)
@@ -1372,7 +1409,12 @@ def _sharded_state_dict_grouped(
                 (ep_axis, global_expert_idx, num_global_experts),
             )
             sub_sd = make_sharded_tensors_for_checkpoint(
-                state_dict, '', tp_axis_map, new_sharded_offsets
+                state_dict,
+                '',
+                tp_axis_map,
+                new_sharded_offsets,
+                tp_group=self._tp_group,
+                dp_cp_group=metadata["dp_cp_group"],
             )
             # Remove expert layers indexing from sharded keys
             replace_prefix_for_sharding(sub_sd, f"{gemm_idx}.", expert_prefix)
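
Each TE wrapper now caches its tensor-parallel group as `self._tp_group` in `__init__` and forwards it, together with `metadata["dp_cp_group"]`, to `make_sharded_tensors_for_checkpoint`, so sharded tensors are built against explicit process groups instead of implicit global parallel state. The calling convention these `sharded_state_dict` methods converge on looks roughly like the sketch below (the `ensure_metadata_has_dp_cp_group` call appears in only one of the methods in this diff and is shown here for completeness; the axis map is just the column-parallel example):

```python
from megatron.core.transformer.utils import (
    ensure_metadata_has_dp_cp_group,
    make_sharded_tensors_for_checkpoint,
)


def sharded_state_dict(self, prefix="", sharded_offsets=(), metadata=None):
    """Sketch of the post-change pattern: pass explicit TP and DP-CP groups."""
    metadata = ensure_metadata_has_dp_cp_group(metadata)  # assumed to fill dp_cp_group when missing
    state_dict = self.state_dict(prefix="", keep_vars=True)
    return make_sharded_tensors_for_checkpoint(
        state_dict,
        prefix,
        {"weight": 0, "bias": 0},  # axis map; varies per layer type
        sharded_offsets,
        tp_group=self._tp_group,  # cached in __init__
        dp_cp_group=metadata["dp_cp_group"],
    )
```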

megatron/core/fp8_utils.py

Lines changed: 30 additions & 1 deletion
@@ -85,6 +85,13 @@
     Fp8Padding = None
     Fp8Unpadding = None

+try:
+    from transformer_engine.pytorch.tensor.utils import (
+        post_all_gather_processing as te_post_all_gather_processing,
+    )
+except ImportError:
+    te_post_all_gather_processing = None
+

 def is_float8tensor(tensor: torch.Tensor) -> bool:
     """Check if a tensor is a Transformer Engine Float8Tensor.
@@ -247,7 +254,15 @@ def _quantize_param_shard_impl(
             raise NotImplementedError(
                 f"FSDP with --fp8-param-gather is not supported in TE v{get_te_version()}"
             )
-        cast_master_weights_to_fp8(*args)
+
+        # For newer TE versions (i.e., have post_all_gather_processing function), we keep the
+        # columnwise data and manually call post_all_gather_processing after all-gather, this
+        # makes fp8 params compatible with CUDA graph.
+        kwargs = {}
+        if te_post_all_gather_processing is not None:
+            kwargs["manual_post_all_gather_processing"] = True
+
+        cast_master_weights_to_fp8(*args, **kwargs)

     def _correct_amax_history_if_needed_impl(model: List[torch.nn.Module]) -> None:
         pass
@@ -481,6 +496,20 @@ def correct_amax_history_if_needed(model: List[torch.nn.Module]):
     _correct_amax_history_if_needed_impl(model)


+def post_all_gather_processing(model_params):
+    """
+    Post-processing after all-gather for weights in distributed optimizer.
+    - tensorwise: may need to create a transposed view to match backend GEMM.
+    - blockwise: create column-wise storage.
+    """
+    if te_post_all_gather_processing is not None:
+        te_post_all_gather_processing(model_params)
+    else:
+        # If the TE version is old and does not have post_all_gather_processing function, this is
+        # a no-op, and the transpose/columnwise data will be created in the next forward pass.
+        pass
+
+
 def is_first_last_bf16_layer(config: TransformerConfig, layer_no: int):
     """Check if the layer is in bf16."""
     num_bf16_layers_at_start = (
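
The new `post_all_gather_processing` wrapper is a version guard: with a recent Transformer Engine it delegates to `transformer_engine.pytorch.tensor.utils.post_all_gather_processing`; with an older TE it is a no-op and the transpose/columnwise data is materialized lazily in the next forward pass. The same guard decides whether `cast_master_weights_to_fp8` receives `manual_post_all_gather_processing=True`. A standalone sketch of that optional-feature pattern (the `quantize_kwargs` helper name is hypothetical; the TE import path matches the diff, but availability depends on the installed version):

```python
try:
    from transformer_engine.pytorch.tensor.utils import (
        post_all_gather_processing as te_post_all_gather_processing,
    )
except ImportError:  # older TE without manual post-all-gather support
    te_post_all_gather_processing = None


def quantize_kwargs() -> dict:
    """Extra kwargs for cast_master_weights_to_fp8, only when TE supports manual post-AG."""
    if te_post_all_gather_processing is not None:
        return {"manual_post_all_gather_processing": True}
    return {}
```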

megatron/core/models/bert/bert_model.py

Lines changed: 3 additions & 1 deletion
@@ -14,6 +14,7 @@
 from megatron.core.models.common.embeddings.language_model_embedding import LanguageModelEmbedding
 from megatron.core.models.common.embeddings.rotary_pos_embedding import RotaryEmbedding
 from megatron.core.models.common.language_module.language_module import LanguageModule
+from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.dot_product_attention import (
     DotProductAttention as MCoreDotProductAttention,
 )
@@ -73,9 +74,10 @@ def __init__(
         seq_len_interpolation_factor: Optional[float] = None,
         add_binary_head=True,
         return_embeddings=False,
+        pg_collection: Optional[ProcessGroupCollection] = None,
         vp_stage: Optional[int] = None,
     ):
-        super(BertModel, self).__init__(config=config)
+        super(BertModel, self).__init__(config=config, pg_collection=pg_collection)

         if has_config_logger_enabled(config):
             log_config_to_disk(config, locals(), prefix=type(self).__name__)
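
`BertModel` now accepts an optional `pg_collection` and forwards it to `LanguageModule.__init__`, matching the process-group plumbing used elsewhere in megatron.core models; when it is left as `None`, behavior presumably falls back to the global parallel state as before. A hedged construction sketch (not runnable as-is: `transformer_config` and `bert_layer_spec` are assumed to exist already, and the other argument values are placeholders, not a tested configuration):

```python
from typing import Optional

from megatron.core.models.bert.bert_model import BertModel
from megatron.core.process_groups_config import ProcessGroupCollection

pg_collection: Optional[ProcessGroupCollection] = None  # or a pre-built collection of process groups

model = BertModel(
    config=transformer_config,                # an existing TransformerConfig
    num_tokentypes=2,                         # illustrative value
    transformer_layer_spec=bert_layer_spec,   # an existing ModuleSpec for the BERT layer
    vocab_size=30522,
    max_sequence_length=512,
    pg_collection=pg_collection,              # new optional argument added in this commit
)
```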
