
Commit 13edb58

Revert "cp: Megatron-FSDP Expert Parallel (DeepSeek-v3) Support into dev (#1987)"
This reverts commit cc33e00.
1 parent cc33e00 commit 13edb58

File tree

21 files changed: +765 −2224 lines changed

megatron/core/distributed/fsdp/mcore_fsdp_adapter.py

Lines changed: 7 additions & 126 deletions
@@ -13,7 +13,6 @@
 # limitations under the License.

 import logging
-import random
 from typing import List, Optional

 try:
@@ -23,7 +22,6 @@
 except ImportError:
     HAVE_EINOPS = False

-import numpy as np
 import torch
 import torch.distributed as dist

@@ -34,11 +32,10 @@
 except ImportError:
     HAVE_DTENSOR = False

-from megatron.core import parallel_state, tensor_parallel
+from megatron.core import parallel_state
 from megatron.core.config_logger import has_config_logger_enabled, log_config_to_disk
 from megatron.core.distributed.data_parallel_base import _BaseDataParallel
 from megatron.core.distributed.distributed_data_parallel_config import DistributedDataParallelConfig
-from megatron.core.extensions.transformer_engine import TELinear
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.transformer.transformer_config import TransformerConfig
 from megatron.core.transformer.transformer_layer import TransformerLayer
@@ -98,8 +95,6 @@ def __init__(
         else:
             self.fsdp_unit_modules = []

-        self._fix_tensor_parallel_attributes(module)
-
         super().__init__(
             config=config,
             module=MegatronFSDP(
@@ -124,8 +119,6 @@ def __init__(
         self.module.state_dict_for_save_checkpoint = self.module.state_dict
         self.state_dict_for_save_checkpoint = self.state_dict

-        self.sync_rng_states_across_tp_group()
-
     def load_state_dict(self, state_dict, strict=True):
         """
         Load the state dictionary into the module.
@@ -148,44 +141,6 @@ def load_state_dict(self, state_dict, strict=True):

         self.module.load_state_dict(custom_state_dict, strict=strict)

-    def _fix_tensor_parallel_attributes(self, module):
-        is_expert_param = lambda n, p: ".experts." in n
-        is_router_param = lambda n, p: ".router.weight" in n
-
-        if parallel_state.get_tensor_model_parallel_group():
-            tp_size = parallel_state.get_tensor_model_parallel_group().size()
-        else:
-            tp_size = 1
-
-        if parallel_state.get_expert_tensor_parallel_group():
-            expt_tp_size = parallel_state.get_expert_tensor_parallel_group().size()
-        else:
-            expt_tp_size = 1
-
-        param_to_direct_module = {}
-        for name, m in module.named_modules():
-            for p in m.parameters(recurse=False):
-                param_to_direct_module[p] = (name, m)
-
-        for name, param in module.named_parameters():
-            if is_expert_param(name, param) and expt_tp_size > 1:
-                setattr(param, "_mcore_tp", True)
-                if "linear_fc1.weight" in name:
-                    setattr(param, "_tp_partition_dim", 0)
-                elif "linear_fc2.weight" in name:
-                    setattr(param, "_tp_partition_dim", 1)
-
-            if not is_expert_param(name, param) and tp_size > 1:
-                m_name, direct_module = param_to_direct_module[param]
-                if isinstance(direct_module, (TELinear,)):
-                    parallel_mode = getattr(direct_module, "parallel_mode", None)
-                    if parallel_mode is None:
-                        setattr(param, "_mcore_tp", True)
-                        setattr(param, "_tp_duplicated", True)
-                elif is_router_param(name, param):
-                    setattr(param, "_mcore_tp", True)
-                    setattr(param, "_tp_duplicated", True)
-
     def _init_dist_index(self, pg_collection):
         """
         Initialize the distributed index for the module.
@@ -199,7 +154,6 @@ def _init_dist_index(self, pg_collection):
         enable_hsdp = self.ddp_config.num_distributed_optimizer_instances > 1
         if pg_collection is None:
             tp_group = parallel_state.get_tensor_model_parallel_group()
-            expt_tp_group = parallel_state.get_expert_tensor_parallel_group()
             if enable_hsdp:
                 dp_cp_group = parallel_state.get_data_parallel_group(
                     with_context_parallel=True, partial_data_parallel=True
@@ -214,11 +168,8 @@ def _init_dist_index(self, pg_collection):
                 )
                 outer_fsdp_group = None
                 hybrid_fsdp_group = None
-            expt_dp_group = parallel_state.get_expert_data_parallel_group()
-            ep_group = parallel_state.get_expert_model_parallel_group()
         else:
             tp_group = getattr(pg_collection, 'tp', None)
-            expt_tp_group = getattr(pg_collection, 'expt_tp', None)
             if enable_hsdp:
                 dp_cp_group = pg_collection.intra_dp_cp
                 outer_fsdp_group = pg_collection.inter_dist_opt
@@ -227,17 +178,11 @@ def _init_dist_index(self, pg_collection):
                 dp_cp_group = pg_collection.dp_cp
                 outer_fsdp_group = None
                 hybrid_fsdp_group = None
-            expt_dp_group = getattr(pg_collection, 'expt_dp', None)
-            ep_group = getattr(pg_collection, 'ep', None)

         if tp_group is None:
             single_rank_group = dist.new_group(ranks=[dist.get_rank()])
             tp_group = single_rank_group

-        if expt_tp_group is None:
-            single_rank_group = dist.new_group(ranks=[dist.get_rank()])
-            expt_tp_group = single_rank_group
-
         if enable_hsdp:
             mesh = _get_hsdp_tp_mesh(outer_fsdp_group, dp_cp_group, tp_group)
             dist_index = FSDPDistributedIndex(
@@ -254,17 +199,6 @@ def _init_dist_index(self, pg_collection):
                 hybrid_fsdp_group=hybrid_fsdp_group,
             )
         else:
-            if ep_group is not None:
-                expt_mesh = _get_dp_tp_mesh(expt_dp_group, expt_tp_group, ep_size=ep_group.size())
-                expt_device_mesh = DeviceMesh.from_group(
-                    [expt_dp_group, expt_tp_group],
-                    device_type="cuda",
-                    mesh=expt_mesh.tolist(),
-                    mesh_dim_names=["dp_cp", "tp"],
-                )
-            else:
-                expt_device_mesh = None
-
             mesh = _get_dp_tp_mesh(dp_cp_group, tp_group)
             dist_index = FSDPDistributedIndex(
                 device_mesh=DeviceMesh.from_group(
@@ -275,11 +209,8 @@ def _init_dist_index(self, pg_collection):
                 ),
                 dp_shard_dim="dp_cp",
                 tp_dim="tp",
-                expt_device_mesh=expt_device_mesh,
             )

-        self.tp_group = tp_group
-
         return dist_index

     def stop_communication(self):
@@ -289,20 +220,6 @@ def stop_communication(self):
         self.module.synchronize_gradient_reduce()
         self.module.synchronize_param_gather()

-    def sync_rng_states_across_tp_group(self):
-        """
-        Synchronize the tensor parallel random number generator states.
-        """
-        if self.tp_group.size() <= 1:
-            return
-
-        if self.tp_group.rank() == 0:
-            broadcast_list = [_get_rng_state_dict()]
-        else:
-            broadcast_list = [None]
-        torch.distributed.broadcast_object_list(broadcast_list, group=self.tp_group, group_src=0)
-        _load_rng_state_dict(broadcast_list[0])
-

 def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
     assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
@@ -356,46 +273,29 @@ def _get_hsdp_tp_mesh(outer_fsdp_dp_group, dp_cp_group, tp_group):
     return mesh


-def _get_dp_tp_mesh(dp_cp_group, tp_group, ep_size=1):
+def _get_dp_tp_mesh(dp_cp_group, tp_group):
     assert HAVE_EINOPS, "einops is not installed. Please install it with `pip install einops`."
     world_size = dist.get_world_size()

     tp_size = dist.get_world_size(tp_group) if tp_group is not None else 1
-    # TODO: Supports configurable (dp, cp, ep, tp) order.
-    mesh = einops.rearrange(
-        torch.arange(world_size),
-        "(dp_cp ep tp) -> ep dp_cp tp",
-        dp_cp=dp_cp_group.size(),
-        tp=tp_size,
-        ep=ep_size,
-    )
+    # TODO: Supports configurable (dp, cp, tp) order.
+    mesh = einops.rearrange(torch.arange(world_size), "(dp_cp tp) -> dp_cp tp", tp=tp_size)

-    mesh_dp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (ep tp) dp_cp', dp_cp=dp_cp_group.size())
+    mesh_dp_ranks = einops.rearrange(mesh, 'dp_cp tp -> tp dp_cp', tp=tp_size)
     dp_cp_group_ranks = dist.get_process_group_ranks(dp_cp_group)
     assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_dp_ranks, dp_cp_group_ranks), (
         f"[Megatron-FSDP] Data Parallel ranks in the mesh {mesh_dp_ranks} "
         f"do not match the ranks in the DP group {dp_cp_group_ranks}."
     )

-    mesh_tp_ranks = einops.rearrange(mesh, 'ep dp_cp tp -> (dp_cp ep) tp', tp=tp_size)
+    mesh_tp_ranks = einops.rearrange(mesh, 'dp_cp tp -> (dp_cp) tp', tp=tp_size)
     tp_group_ranks = dist.get_process_group_ranks(tp_group)
     assert _check_mesh_ranks_and_group_ranks_are_consistent(mesh_tp_ranks, tp_group_ranks), (
         f"[Megatron-FSDP] Tensor Parallel ranks in the mesh {mesh_tp_ranks} "
         f"do not match the ranks in the TP group {tp_group_ranks}."
     )

-    # Exclude the expert parallel dimension
-    rank = dist.get_rank()
-    dp_tp_meshes = [per_ep_mesh for per_ep_mesh in mesh if rank in per_ep_mesh.reshape(-1).tolist()]
-    assert (
-        len(dp_tp_meshes) == 1
-    ), f"[Megatron-FSDP] Current rank {rank} is not unique in the mesh ranks {mesh.tolist()}."
-    assert len(dp_tp_meshes[0].reshape(-1).tolist()) == dp_cp_group.size() * tp_group.size(), (
-        f"[Megatron-FSDP] DP-TP mesh size {len(dp_tp_meshes[0].reshape(-1).tolist())} "
-        f"does not match expected size {dp_cp_group.size() * tp_group.size()}."
-    )
-
-    return dp_tp_meshes[0]
+    return mesh


 def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks):
@@ -410,22 +310,3 @@ def _check_mesh_ranks_and_group_ranks_are_consistent(mesh_ranks, group_ranks):
         f"{mesh_ranks.tolist()} does not match the group ranks {group_ranks}."
     )
     return sorted(current_ranks[0]) == sorted(group_ranks)
-
-
-def _get_rng_state_dict():
-    rng_state_dict = {
-        'random_rng_state': random.getstate(),
-        'np_rng_state': np.random.get_state(),
-        'torch_rng_state': torch.get_rng_state(),
-        'cuda_rng_state': torch.cuda.get_rng_state(),
-        'rng_tracker_states': tensor_parallel.get_cuda_rng_tracker().get_states(),
-    }
-    return rng_state_dict
-
-
-def _load_rng_state_dict(rng_state_dict):
-    random.setstate(rng_state_dict['random_rng_state'])
-    np.random.set_state(rng_state_dict['np_rng_state'])
-    torch.set_rng_state(rng_state_dict['torch_rng_state'])
-    torch.cuda.set_rng_state(rng_state_dict['cuda_rng_state'])
-    tensor_parallel.get_cuda_rng_tracker().set_states(rng_state_dict['rng_tracker_states'])
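
For orientation, the restored `_get_dp_tp_mesh` lays ranks out on a plain `dp_cp × tp` grid. Below is a minimal, self-contained sketch of that layout; the world size and TP size are hypothetical, chosen only for illustration, since the real code derives them from the process groups.

```python
import torch
import einops

world_size = 8  # hypothetical total number of ranks
tp_size = 2     # hypothetical tensor-parallel size

# Ranks are laid out as (dp_cp, tp): consecutive ranks share a dp_cp row.
mesh = einops.rearrange(torch.arange(world_size), "(dp_cp tp) -> dp_cp tp", tp=tp_size)
print(mesh)
# tensor([[0, 1],
#         [2, 3],
#         [4, 5],
#         [6, 7]])

# Transposing recovers the DP-CP rank groups per TP rank, which the adapter
# cross-checks against dist.get_process_group_ranks(dp_cp_group).
mesh_dp_ranks = einops.rearrange(mesh, "dp_cp tp -> tp dp_cp")
print(mesh_dp_ranks)
# tensor([[0, 2, 4, 6],
#         [1, 3, 5, 7]])
```

The reverted EP-aware variant instead built an `"(dp_cp ep tp) -> ep dp_cp tp"` grid and returned only the current rank's DP-TP slice; after the revert, the full `dp_cp × tp` mesh is returned directly.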

megatron/core/distributed/fsdp/src/README.md

Lines changed: 0 additions & 11 deletions
@@ -127,12 +127,6 @@ device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp")
 # Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group.
 device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
 hsdp_group = device_mesh["hsdp"].get_group()
-# Initialize DeviceMesh for expert parallel (EP) modules when using FSDP + EP.
-expert_device_mesh = torch.distributed.device_mesh.init_device_mesh(
-    "cuda",
-    mesh_shape=(expt_dp_shard_size, expt_tp_size),
-    mesh_dim_names=("dp_shard", "tp"),
-)

 # Fully-shards your model and distributes your optimizer.
 model, optimizer = fully_shard(
@@ -151,8 +145,6 @@ model, optimizer = fully_shard(
     tp_dim="tp",
     # Only required when using HSDP. Otherwise, set this to None.
     hybrid_fsdp_group=hsdp_group,
-    # Only required for FSDP + EP. Otherwise, set this to None.
-    expt_device_mesh=expt_device_mesh,
     # FSDP Sharding Strategy: no_shard (0) / optim (1) / optim_grads (2) / optim_grads_params (3)
     zero_dp_strategy=3,
     outer_dp_sharding_strategy=1,
@@ -200,9 +192,6 @@ optimizer.load_state_dict(ckpt_state_dict["optimizer"])
 - `tp_dim` is the name of the sub-mesh used for tensor parallelism (TP), which is required for `(FSDP, TP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` TP.
   - For more information about tensor parallelism, refer to: [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053).
 - `hybrid_fsdp_group` is the `ProcessGroup` which contains all ranks in the flattened `dp_shard_dim` and `dp_outer_dim` sub-meshes utilized to specify the `(DP-Outer, DP-Shard)` sharded coordinate system for the weight and gradient buffers. Required for HSDP.
-- `expt_device_mesh` is another [`torch.distributed.DeviceMesh`](https://docs.pytorch.org/docs/stable/distributed.html#devicemesh) tailored for the expert parallel (EP) modules in `MegatronFSDP`.
-  - `dp_shard_dim` is the name of the sub-mesh required for FSDP sharding of the EP modules, enabling expert data parallelism (EDP).
-  - `tp_dim` is the name of the sub-mesh used for expert tensor parallelism (ETP), which is required for `(FSDP, ETP)`-strided sharding when using Megatron-LM or Torch-native `DTensor` ETP.
 - `init_model_with_meta_device` has `MegatronFSDP` initialize your `meta`-device model in shards on every CUDA device to avoid OOM when initializing extremely large models that cannot fit on a single device. Users can initialize their model on a [`meta`-device](https://docs.pytorch.org/docs/stable/meta.html) (`with torch.device('meta'): ...`), and ``MegatronFSDP`` will further shard and initialize the model parameters layer-by-layer adhering to the customizable `module.reset_parameters` method, which prevents the entire model from being allocated in memory at any point during runtime.
   - Defaults to `False`.
   - Note that the `device` argument which installs your model on a specific device or rank will be deactivated when `init_model_with_meta_device=True`.
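
To see how the surviving (post-revert) README flow fits together end to end, here is a minimal sketch of the mesh setup it describes. The dimension sizes are hypothetical, and it assumes `torch.distributed` is already initialized (e.g. via `torchrun`) with exactly `dp_outer * dp_shard * cp * tp` ranks.

```python
from torch.distributed.device_mesh import init_device_mesh

# Hypothetical sizes for an 8-GPU run; dp_outer > 1 only matters for HSDP.
dp_outer, dp_shard, cp, tp = 1, 4, 1, 2

device_mesh = init_device_mesh(
    "cuda",
    mesh_shape=(dp_outer, dp_shard, cp, tp),
    mesh_dim_names=("dp_outer", "dp_shard", "cp", "tp"),
)

# FSDP shards over the flattened (dp_shard, cp) sub-mesh, as in the README.
device_mesh[("dp_shard", "cp")]._flatten("dp_shard_cp")

# Only required if using HSDP. Otherwise, don't pass hybrid_fsdp_group.
device_mesh[("dp_outer", "dp_shard", "cp")]._flatten("hsdp")
hsdp_group = device_mesh["hsdp"].get_group()
```

After this revert, the expert-parallel `expert_device_mesh` is no longer constructed; only the dense `dp_shard_cp`/`tp` sub-meshes (plus `hsdp` for HSDP) feed into `fully_shard`.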

megatron/core/distributed/fsdp/src/megatron_fsdp/fully_shard.py

Lines changed: 1 addition & 9 deletions
@@ -64,7 +64,6 @@ def fully_shard_model(
     dp_outer_dim: Optional[str] = None,
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
-    expt_device_mesh: Optional[DeviceMesh] = None,
     fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None,
     zero_dp_strategy: str | int = 3,
     outer_dp_sharding_strategy: str | int = 0,
@@ -184,10 +183,8 @@ def fully_shard_model(
         tp_dim=tp_dim,
         # Only required for HSDP.
         hybrid_fsdp_group=hybrid_fsdp_group,
-        # Access to flattened DP rank assignments for HSDP.
+        # Access to flattened DP rank assignments for HFSDP.
         hsdp_outer_dp_shard=_outer_fsdp_sharding,
-        # Only required for Megatron-FSDP + EP.
-        expt_device_mesh=expt_device_mesh,
     )

     # Wrap model in Megatron FSDP.
@@ -333,7 +330,6 @@ def fully_shard(
     dp_outer_dim: Optional[str] = None,
     tp_dim: Optional[str] = None,
     hybrid_fsdp_group: Optional[torch.distributed.ProcessGroup] = None,
-    expt_device_mesh: Optional[DeviceMesh] = None,
     fsdp_unit_modules: Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]] = None,
     zero_dp_strategy: str | int = 3,
     outer_dp_sharding_strategy: str | int = 0,
@@ -395,9 +391,6 @@ def fully_shard(
             by flattening the outer-FSDP (dp_outer_dim) and FSDP (dp_shard_dim) process groups
             or sub-meshes. Defaults to None. Required for HSDP, i.e. if dp_outer_dim is not None.

-        expt_device_mesh (Optional[DeviceMesh]):
-            Expert parallel device mesh object defining the topology for MoE distributed training.
-
         fsdp_unit_modules (Optional[Sequence[Type[torch.nn.Module]] | Sequence[str]]):
             List of (sub-)module classes or (sub-)module class import paths that are "units",
             which are torch.nn.Module(s) that are sharded and scheduled by Megatron-FSDP.
@@ -510,7 +503,6 @@ def fully_shard(
         dp_outer_dim=dp_outer_dim,
         tp_dim=tp_dim,
         hybrid_fsdp_group=hybrid_fsdp_group,
-        expt_device_mesh=expt_device_mesh,
         fsdp_unit_modules=fsdp_unit_modules,
         zero_dp_strategy=zero_dp_strategy,
         outer_dp_sharding_strategy=outer_dp_sharding_strategy,
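
For context, here is a hedged sketch of a `fully_shard` call after this revert. Only the keyword arguments visible in this diff and the README excerpt are used; the import path and the leading `model`/`optimizer`/`device_mesh`/`dp_shard_dim` arguments are assumptions drawn from the README's usage example and may differ in the actual API.

```python
# Hypothetical usage sketch; `model`, `optimizer`, `device_mesh`, and `hsdp_group`
# are assumed to have been built as in the README example above.
from megatron_fsdp import fully_shard  # assumed import path for the standalone package

model, optimizer = fully_shard(
    model,
    optimizer,
    device_mesh=device_mesh,        # assumed keyword, per the README example
    dp_shard_dim="dp_shard_cp",     # flattened (dp_shard, cp) sub-mesh
    dp_outer_dim="dp_outer",        # only for HSDP
    tp_dim="tp",
    hybrid_fsdp_group=hsdp_group,   # only required for HSDP, otherwise None
    # expt_device_mesh no longer exists after this revert; passing it raises TypeError.
    zero_dp_strategy=3,             # optim_grads_params
    outer_dp_sharding_strategy=1,   # optim (HSDP only)
)
```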

megatron/core/distributed/fsdp/src/megatron_fsdp/megatron_fsdp.py

Lines changed: 4 additions & 7 deletions
@@ -235,10 +235,7 @@ def __init__(
         self.dist_index = dist_index

         # If Megatron Expert Parallelism is enabled, you need to provide an expt_dp_group.
-        if (
-            has_expert_parameters
-            and self.dist_index.get_fsdp_group(is_expert_parallel=True) is None
-        ):
+        if has_expert_parameters and self.dist_index.get_expert_dp_group() is None:
             raise ValueError(
                 "[Megatron-FSDP] Megatron Expert Parallelism is enabled, but no expt_dp_group is"
                 "provided."
@@ -356,7 +353,9 @@ def _init_fsdp_param_and_grad_buffer(self):
         )

         # Set the suggested communication unit size for reduce-scatter and all-gather pipelines.
-        suggested_communication_unit_size = self.ddp_config.suggested_communication_unit_size
+        suggested_communication_unit_size = (
+            self.ddp_config.suggested_communication_unit_size or 1_000_000_000
+        )
         if suggested_communication_unit_size is None:
             if self.data_parallel_sharding_strategy == "optim_grads_params":
                 total_param_elements = 0
@@ -371,8 +370,6 @@ def _init_fsdp_param_and_grad_buffer(self):
                 suggested_communication_unit_size = total_param_elements // total_fsdp_module * 2
             elif self.bucket_size is not None:
                 suggested_communication_unit_size = self.bucket_size
-            else:
-                suggested_communication_unit_size = 1_000_000_000

         # Cap to 1B elements.
         suggested_communication_unit_size = max(
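
One subtlety in the restored hunk above: `or` treats both `None` and `0` as unset, so the `if suggested_communication_unit_size is None:` branch that follows appears to become unreachable. A tiny sketch of the fallback semantics (the function name is illustrative only, not part of the codebase):

```python
def resolve_comm_unit_size(configured):
    # Mirrors `configured or 1_000_000_000`: None (and 0) fall back to 1B elements.
    return configured or 1_000_000_000

assert resolve_comm_unit_size(None) == 1_000_000_000
assert resolve_comm_unit_size(0) == 1_000_000_000
assert resolve_comm_unit_size(250_000_000) == 250_000_000
```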
