
Commit e4b7259

Authored by jaredcasper, rogerwaleffe, and santhnm2
Fix Mamba TP and remove confusing legacy initialization (#2202)
Co-authored-by: Roger Waleffe <[email protected]>
Co-authored-by: Keshav Santhanam <[email protected]>
1 parent 7dec856 commit e4b7259

File tree (3 files changed: +11, -71 lines)

  megatron/core/ssm/mamba_block.py
  megatron/core/ssm/mamba_mixer.py
  tests/unit_tests/inference/engines/test_dynamic_engine.py


megatron/core/ssm/mamba_block.py (0 additions, 55 deletions)

@@ -5,10 +5,8 @@
 # This source code is licensed under the Apache license found in the
 # LICENSE file in the root directory of this source tree.
 
-import math
 from contextlib import nullcontext
 from dataclasses import dataclass
-from functools import partial
 from typing import Optional, Tuple, Union
 
 import torch
@@ -23,7 +21,6 @@
 from megatron.core.process_groups_config import ProcessGroupCollection
 from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols as LayerSymbols
 from megatron.core.ssm.mamba_hybrid_layer_allocation import allocate_layers
-from megatron.core.tensor_parallel import get_cuda_rng_tracker
 from megatron.core.transformer import TransformerConfig
 from megatron.core.transformer.identity_op import IdentityOp
 from megatron.core.transformer.module import MegatronModule
@@ -33,50 +30,6 @@
 from megatron.core.utils import WrappedTensor, deprecate_inference_params, make_viewless_tensor
 
 
-# https://github.com/huggingface/transformers/blob/c28d04e9e252a1a099944e325685f14d242ecdcd/src/transformers/models/gpt2/modeling_gpt2.py#L454
-def _init_weights(
-    module,
-    n_layer,
-    initializer_range=0.02,  # Now only used for embedding layer.
-    rescale_prenorm_residual=True,
-    n_residuals_per_layer=1,  # Change to 2 if we have MLP
-):
-    with get_cuda_rng_tracker().fork():
-        if isinstance(module, nn.Linear):
-            if not getattr(module.weight, "_no_reinit", False):
-                nn.init.normal_(module.weight, std=initializer_range)
-            if module.bias is not None:
-                if not getattr(module.bias, "_no_reinit", False):
-                    nn.init.zeros_(module.bias)
-        elif isinstance(module, nn.Embedding):
-            nn.init.normal_(module.weight, std=initializer_range)
-
-        for name, p in module.named_parameters():
-            if name in ["conv1d.weight", "out_proj.weight"]:
-                nn.init.kaiming_uniform_(p, a=math.sqrt(5))
-            if name in ["in_proj.weight"]:
-                nn.init.normal_(p, mean=0.0, std=initializer_range)
-
-        if rescale_prenorm_residual:
-            # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
-            # > A modified initialization which accounts for the accumulation on the
-            # > residual path with model depth. Scale
-            # > the weights of residual layers at initialization by a factor of
-            # > 1/√N where N is the # of residual layers.
-            # > -- GPT-2 :: https://openai.com/blog/better-language-models/
-            #
-            # Reference (Megatron-LM):
-            # https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
-            for name, p in module.named_parameters():
-                if name in ["out_proj.weight", "fc2.weight"]:
-                    # Special Scaled Initialization
-                    nn.init.normal_(
-                        p,
-                        mean=0.0,
-                        std=initializer_range / math.sqrt(n_residuals_per_layer * n_layer),
-                    )
-
-
 @dataclass
 class MambaStackSubmodules:
     """
@@ -210,14 +163,6 @@ def __init__(
             eps=self.config.layernorm_epsilon,
         )
 
-        self.apply(
-            partial(
-                _init_weights,
-                n_layer=self.config.num_layers,
-                initializer_range=self.config.init_method_std,
-            )
-        )
-
     def _select_layers_for_pipeline_parallel(self, layer_type_list):
         num_layers_per_pipeline_rank = self.config.num_layers // self.pp_group.size()
 
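Note: with the post-construction `self.apply(partial(_init_weights, ...))` pass removed, the Mamba stack keeps the initialization its layers already get from `TransformerConfig` (driven by `init_method_std`, with a depth-scaled variant for output projections). As a rough illustration of the two scaling conventions involved, here is a minimal standalone sketch; the `2 * num_layers` factor assumes Megatron's usual scaled output-layer init, and the concrete numbers are placeholders, not values from this commit.

import math

import torch.nn as nn

init_method_std = 0.02   # stands in for config.init_method_std
num_layers = 24          # stands in for config.num_layers

# Legacy rescale done by the removed helper: std / sqrt(n_residuals_per_layer * n_layer)
legacy_std = init_method_std / math.sqrt(1 * num_layers)

# Depth-scaled init commonly used for residual/output projections: std / sqrt(2 * num_layers)
scaled_std = init_method_std / math.sqrt(2 * num_layers)

# Initialize a toy projection with the scaled std and compare the empirical spread.
w = nn.Linear(16, 16, bias=False).weight
nn.init.normal_(w, mean=0.0, std=scaled_std)
print(f"legacy std={legacy_std:.5f}, scaled std={scaled_std:.5f}, sample std={w.std().item():.5f}")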

megatron/core/ssm/mamba_mixer.py (3 additions, 9 deletions)

@@ -293,6 +293,8 @@ def __init__(
 
         if self.conv_init is not None:
             nn.init.uniform_(self.conv1d.weight, -self.conv_init, self.conv_init)
+        else:
+            nn.init.kaiming_uniform_(self.conv1d.weight, a=math.sqrt(5))
 
         self.activation = "silu"
         self.act = nn.SiLU()
@@ -311,13 +313,6 @@ def __init__(
         # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
         inv_dt = dt + torch.log(-torch.expm1(-dt))
         self.dt_bias = nn.Parameter(inv_dt)
-        # Our initialization would set all Linear.bias to zero,
-        # need to mark this one as _no_reinit
-        self.dt_bias._no_reinit = True
-        # Just to be explicit. Without this we already don't
-        # put wd on dt_bias because of the check
-        # name.endswith("bias") in param_grouping.py
-        self.dt_bias._no_weight_decay = True
         setattr(self.dt_bias, "tensor_model_parallel", True)
 
         # A parameter
@@ -327,7 +322,6 @@ def __init__(
         ).uniform_(*A_init_range)
         A_log = torch.log(A)  # Keep A_log in fp32
         self.A_log = nn.Parameter(A_log)
-        self.A_log._no_weight_decay = True
         setattr(self.A_log, "tensor_model_parallel", True)
 
         # D "skip" parameter
@@ -337,7 +331,6 @@ def __init__(
                 device=torch.cuda.current_device(),
             )
         )  # Keep in fp32
-        self.D._no_weight_decay = True
         setattr(self.D, "tensor_model_parallel", True)
 
         if self.rmsnorm:
@@ -350,6 +343,7 @@ def __init__(
                     device=torch.cuda.current_device(),
                     dtype=config.params_dtype,
                 )
+            setattr(self.norm.weight, "tensor_model_parallel", True)
 
         # Assume sequence parallelism: input is partitioned along d_inner and
         # output is partitioned along the sequence dimension
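Note: the mixer changes carry the tensor-parallel part of the fix. The gated RMSNorm weight is sharded across TP ranks, so it is now tagged with a `tensor_model_parallel` attribute like the other sharded parameters, while the conv1d fallback init moves into the mixer itself and the `_no_reinit` / `_no_weight_decay` markers become unnecessary once the global re-init pass in mamba_block.py is gone. A minimal sketch of the tagging pattern follows; only the `setattr` line mirrors the diff, and `is_tp_partitioned` is a hypothetical helper, not Megatron's API.

import torch
import torch.nn as nn

# Per-rank shard of a norm weight, standing in for self.norm.weight in the mixer.
norm_weight = nn.Parameter(torch.ones(128))
# Same pattern as the added line in mamba_mixer.py: mark the parameter as TP-sharded.
setattr(norm_weight, "tensor_model_parallel", True)


def is_tp_partitioned(param: torch.Tensor) -> bool:
    """Hypothetical check mirroring how downstream code typically consumes the flag."""
    return getattr(param, "tensor_model_parallel", False)


print(is_tp_partitioned(norm_weight))  # True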

tests/unit_tests/inference/engines/test_dynamic_engine.py (8 additions, 7 deletions)

@@ -347,6 +347,7 @@ def _build_test_env(cls, test_config):
             fp8="hybrid" if test_config.fp8 else None,
             fp8_recipe="tensorwise" if test_config.fp8 else None,
             cuda_graph_scope=test_config.cuda_graph_scope,
+            is_hybrid_model=True,  # Needs to be set for correct out_proj init
         )
 
         # Mamba model.
@@ -557,13 +558,13 @@ def test_simple(self, model_provider, num_cuda_graphs, cuda_graph_scope) -> None
         ]
 
         mamba_expected_generated_tokens = [
-            [74, 72, 83, 59, 1, 70, 15, 89, 30, 52, 82, 70, 64, 16, 83, 5],
-            [25, 54, 42, 57, 33, 64, 60, 13, 28, 74, 8, 4, 56, 68, 87, 82],
-            [31, 55, 77, 25, 96, 13, 32, 49, 40, 54, 73, 10, 50, 2, 64, 96],
-            [72, 80, 35, 72, 77, 85, 98, 36, 4, 97, 37, 46, 79, 95, 83, 85],
-            [8, 80, 56, 4, 87, 1, 15, 98, 85, 7, 31, 38, 91, 28, 18, 80],
-            [9, 94, 48, 60, 87, 57, 25, 76, 91, 34, 69, 86, 73, 24, 63, 97],
-            [17, 5, 62, 66, 15, 52, 32, 75, 66, 18, 69, 5, 67, 37, 94, 51],
+            [74, 72, 9, 59, 1, 70, 15, 89, 30, 52, 82, 70, 64, 16, 83, 5],
+            [25, 54, 28, 14, 87, 27, 60, 92, 28, 74, 8, 63, 60, 68, 87, 82],
+            [31, 21, 87, 25, 96, 13, 32, 49, 40, 54, 55, 68, 73, 2, 64, 96],
+            [72, 80, 35, 72, 77, 85, 98, 36, 4, 97, 37, 46, 79, 95, 83, 25],
+            [8, 80, 56, 4, 87, 1, 43, 98, 85, 7, 50, 38, 24, 28, 18, 80],
+            [9, 94, 36, 16, 87, 57, 25, 76, 64, 92, 47, 86, 73, 72, 71, 97],
+            [17, 5, 62, 66, 15, 52, 32, 75, 66, 18, 90, 14, 67, 37, 94, 33],
             [],
         ]
 
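Note: the test env now builds its Mamba config with `is_hybrid_model=True` so out_proj receives the hybrid-model initialization, and the expected greedy token IDs are regenerated because the initialization (and hence the random weights) changed. A minimal sketch of the config change follows, assuming these keyword arguments land on `TransformerConfig` as the surrounding fp8/cuda_graph_scope arguments suggest; the other field values are placeholders, not the test's real setup.

from megatron.core.transformer import TransformerConfig

# Assumed construction for illustration only; field values are placeholders.
config = TransformerConfig(
    num_layers=2,
    hidden_size=128,
    num_attention_heads=4,
    is_hybrid_model=True,  # needed for correct out_proj init, per the added test line
)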
