
Commit 6c8cdd5

Add MambaInferenceStateConfig dataclass (#2265)
Signed-off-by: Keshav Santhanam <[email protected]>
1 parent 712dff8 commit 6c8cdd5

10 files changed: +96 −111 lines


examples/inference/gpt/gpt_dynamic_inference.py

Lines changed: 8 additions & 19 deletions
@@ -30,6 +30,9 @@
     ContextOverflowError,
     DynamicInferenceContext,
 )
+from megatron.core.inference.context.attention_context.mamba_metadata import (
+    MambaInferenceStateConfig,
+)
 from megatron.core.inference.engines import DynamicInferenceEngine, EngineSuspendedError
 from megatron.core.inference.model_inference_wrappers.gpt.gpt_inference_wrapper import (
     GPTInferenceWrapper,
@@ -38,10 +41,9 @@
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols
 from megatron.core.tokenizers.text.utils.build_tokenizer import build_tokenizer
 from megatron.core.transformer.module import MegatronModule
-from megatron.core.utils import get_attr_wrapped_model
+from megatron.core.utils import get_mamba_inference_state_config_from_model

 sys.path.append(
     os.path.abspath(os.path.join(os.path.dirname(__file__), os.path.pardir, os.path.pardir))
@@ -150,9 +152,7 @@ def get_inference_context(
     requests: List[Request],
     sampling_params: Optional[SamplingParams] = None,
     calculate_max_sequence_length_from_requests: bool = True,
-    layer_type_list: Optional[List[str]] = None,
-    mamba_conv_states_shape: Optional[Tuple[int]] = None,
-    mamba_ssm_states_shape: Optional[Tuple[int]] = None,
+    mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None,
 ):
     """The inference context manages the KV cache and other inference state."""

@@ -189,9 +189,7 @@ def get_inference_context(
         max_tokens=args.inference_dynamic_batching_max_tokens,
         tensor_model_parallel_size=args.tensor_model_parallel_size,
         materialize_only_last_token_logits=not args.return_log_probs,
-        layer_type_list=layer_type_list,
-        mamba_conv_states_shape=mamba_conv_states_shape,
-        mamba_ssm_states_shape=mamba_ssm_states_shape,
+        mamba_inference_state_config=mamba_inference_state_config,
         cache_mla_latent=args.multi_latent_attention and args.cache_mla_latents,
         kv_lora_rank=args.kv_lora_rank if args.multi_latent_attention else None,
         qk_pos_emb_head_dim=args.qk_pos_emb_head_dim,
@@ -443,23 +441,14 @@ def main():

     model = get_model()

-    # Layer type list for hybrid models
-    decoder = get_attr_wrapped_model(model, "decoder")
-    layer_type_list = getattr(decoder, "layer_type_list", None)
-    if layer_type_list is not None and Symbols.MAMBA in layer_type_list:
-        (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request()
-    else:
-        mamba_conv_states_shape = None
-        mamba_ssm_states_shape = None
+    mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)

     # Requests, context, controller.
     requests = build_requests(args, tokenizer, sampling_params)
     context = get_inference_context(
         requests,
         sampling_params,
-        layer_type_list=layer_type_list,
-        mamba_conv_states_shape=mamba_conv_states_shape,
-        mamba_ssm_states_shape=mamba_ssm_states_shape,
+        mamba_inference_state_config=mamba_inference_state_config,
     )
     controller = get_inference_controller(model, context)
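Taken together, this file's change collapses the three Mamba-specific keyword arguments into one optional config object. A hedged caller-side sketch of the resulting pattern, using only names that appear in the diff above (`model`, `requests`, and `sampling_params` are assumed to be set up as in `main()`):

    # Old: three separate keyword arguments threaded through every call site.
    # context = get_inference_context(
    #     requests,
    #     sampling_params,
    #     layer_type_list=layer_type_list,
    #     mamba_conv_states_shape=mamba_conv_states_shape,
    #     mamba_ssm_states_shape=mamba_ssm_states_shape,
    # )

    # New: one optional config object, built once from the model.
    mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)
    context = get_inference_context(
        requests,
        sampling_params,
        mamba_inference_state_config=mamba_inference_state_config,
    )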

examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py

Lines changed: 3 additions & 16 deletions
@@ -30,8 +30,7 @@
 from megatron.core.inference.inference_client import InferenceClient
 from megatron.core.inference.inference_request import DynamicInferenceRequestRecord
 from megatron.core.inference.sampling_params import SamplingParams
-from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols
-from megatron.core.utils import get_attr_wrapped_model
+from megatron.core.utils import get_mamba_inference_state_config_from_model

 from megatron.training import get_args, get_tokenizer, initialize_megatron
 from megatron.training.arguments import parse_args
@@ -225,28 +224,16 @@ async def main(

     # Requests, context, conroller.
     model = get_model()
+    mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)
     requests = (
         build_requests(args, tokenizer, sampling_params) if dist.get_rank() == 0 else None
     )

-    # Layer type list for hybrid models
-    decoder = get_attr_wrapped_model(model, "decoder")
-    layer_type_list = getattr(decoder, "layer_type_list", None)
-    if layer_type_list is not None and Symbols.MAMBA in layer_type_list:
-        (mamba_conv_states_shape, mamba_ssm_states_shape) = (
-            decoder.mamba_state_shapes_per_request()
-        )
-    else:
-        mamba_conv_states_shape = None
-        mamba_ssm_states_shape = None
-
     context = get_inference_context(
         None,
         None,
         calculate_max_sequence_length_from_requests=False,
-        layer_type_list=layer_type_list,
-        mamba_conv_states_shape=mamba_conv_states_shape,
-        mamba_ssm_states_shape=mamba_ssm_states_shape,
+        mamba_inference_state_config=mamba_inference_state_config,
     )

     controller = get_inference_controller(model, context)

megatron/core/inference/contexts/attention_context/mamba_metadata.py

Lines changed: 21 additions & 1 deletion
@@ -1,8 +1,28 @@
 # Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.

+from dataclasses import dataclass
+from typing import List, Optional, Tuple
+
 import torch


+@dataclass
+class MambaInferenceStateConfig:
+    """Config for initializing Mamba model inference state tensors."""
+
+    layer_type_list: List[str]
+    """
+    A list of strings that indicates the layer type (Mamba / Attention / MLP) for each layer.
+    See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list of symbols.
+    """
+
+    mamba_conv_states_shape: Tuple[int]
+    """Mamba conv states shape per request."""
+
+    mamba_ssm_states_shape: Tuple[int]
+    """Mamba ssm states shape per request."""
+
+
 class MambaMetadata:
     """Manages the metadata tensors required for Mamba layers during inference."""

@@ -64,7 +84,7 @@ def update_cudagraph_mapping(
         """
         self.request_to_mamba_state_idx_cudagraph_only[0:num_active_requests] = active_mamba_indices

-    def allocate_slot(self) -> int:
+    def allocate_slot(self) -> Optional[int]:
         """
         Allocates a new slot for a request in the Mamba state buffers.

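For reference, a minimal construction sketch for the new dataclass. The import path and `Symbols` values follow the unit-test diff later in this commit; the two shapes are the illustrative per-request values used there, not values a real model is required to have:

    from megatron.core.inference.contexts.attention_context.mamba_metadata import (
        MambaInferenceStateConfig,
    )
    from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols

    # One layer-type symbol per decoder layer; see mamba_hybrid_layer_allocation.py.
    layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP]

    config = MambaInferenceStateConfig(
        layer_type_list=layer_type_list,
        mamba_conv_states_shape=(544, 4),    # per-request conv state shape (example values)
        mamba_ssm_states_shape=(8, 64, 16),  # per-request SSM state shape (example values)
    )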

megatron/core/inference/contexts/dynamic_context.py

Lines changed: 11 additions & 18 deletions
@@ -24,14 +24,11 @@
 from megatron.core.inference.utils import tensor_swap
 from megatron.core.models.common.embeddings.rope_utils import apply_rotary_pos_emb
 from megatron.core.package_info import __version__ as mcore_version
-from megatron.core.ssm.mamba_hybrid_layer_allocation import (
-    Symbols,
-    get_layer_maps_from_layer_type_list,
-)
+from megatron.core.ssm.mamba_hybrid_layer_allocation import get_layer_maps_from_layer_type_list
 from megatron.core.transformer import TransformerConfig
 from megatron.core.utils import divide as core_divide

-from .attention_context.mamba_metadata import MambaMetadata
+from .attention_context.mamba_metadata import MambaInferenceStateConfig, MambaMetadata
 from .attention_context.mha_metadata import GraphedMHAMetadata, NonGraphedMHAMetadata
 from .base_context import BaseInferenceContext
 from .dynamic_block_allocator import BlockAllocator
@@ -231,14 +228,8 @@ class DynamicInferenceContext(BaseInferenceContext):
         materialize_only_last_token_logits (Optional[bool]): Whether to only
             materialize logits for the last token. This should be set to False
             if returning log probs.
-        layer_type_list (Optional[List[str]]): A list of strings that indicates
-            the layer type (Mamba / Attention / MLP) for each layer.
-            See `megatron/core/ssm/mamba_hybrid_layer_allocation.py` for the list
-            of symbols. This must be provided for hybrid models.
-        mamba_conv_states_shape: (Optional[Tuple[int]]): Mamba conv states shape per request.
-            This must be provided for hybrid models.
-        mamba_ssm_states_shape: (Optional[Tuple[int]]): Mamba ssm states shape per request.
-            This must be provided for hybrid models.
+        mamba_inference_state_config (Optional[MambaInferenceStateConfig]): The Mamba
+            inference state config if the model is a hybrid model.
         use_cuda_graphs_for_non_decode_steps (bool): If True, use cuda graphs for non-decode
             engine steps.
         unified_memory_level (Optional[int]): Set unified memory usage within the
@@ -274,9 +265,7 @@ def __init__(
         qk_pos_emb_head_dim: Optional[int] = None,
         num_cuda_graphs: Optional[int] = None,
         materialize_only_last_token_logits: Optional[bool] = True,
-        layer_type_list: Optional[List[str]] = None,
-        mamba_conv_states_shape: Optional[Tuple[int]] = None,
-        mamba_ssm_states_shape: Optional[Tuple[int]] = None,
+        mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None,
         use_cuda_graphs_for_non_decode_steps: bool = True,
         use_flashinfer_fused_rope: bool = False,
         unified_memory_level: Optional[int] = 1,
@@ -303,8 +292,10 @@ def __init__(
         self.num_attention_heads_per_partition = core_divide(num_attention_heads, tp_size)

         # Mamba states.
-        self.is_hybrid_model = layer_type_list is not None and Symbols.MAMBA in layer_type_list
+        self.is_hybrid_model = mamba_inference_state_config is not None
         if self.is_hybrid_model:
+            mamba_conv_states_shape = mamba_inference_state_config.mamba_conv_states_shape
+            mamba_ssm_states_shape = mamba_inference_state_config.mamba_ssm_states_shape
             assert (
                 mamba_conv_states_shape is not None
             ), "`mamba_conv_states_shape` must be specified for hybrid models"
@@ -319,7 +310,7 @@ def __init__(
             # corresponding attention layer index or Mamba layer index depending on the
             # layer type.
             attention_layer_map, mamba_layer_map, _, _ = get_layer_maps_from_layer_type_list(
-                layer_type_list
+                mamba_inference_state_config.layer_type_list
             )
             self.num_attention_layers = len(attention_layer_map)
             self.num_mamba_layers = len(mamba_layer_map)
@@ -728,6 +719,7 @@ def from_config(
         max_batch_size: int,
         buffer_size_gb: float = 40,
         num_cuda_graphs: int = None,
+        mamba_inference_state_config: Optional[MambaInferenceStateConfig] = None,
     ):
         """
         Instantiate a `DynamicInferenceContext` from a `TransformerConfig` and an `InferenceWrapperConfig`.
@@ -749,6 +741,7 @@
             materialize_only_last_token_logits=False,
             num_cuda_graphs=num_cuda_graphs,
             use_flashinfer_fused_rope=None,
+            mamba_inference_state_config=mamba_inference_state_config,
         )

     @classmethod

megatron/core/inference/engines/static_engine.py

Lines changed: 6 additions & 1 deletion
@@ -17,7 +17,7 @@
 from megatron.core.inference.text_generation_controllers.text_generation_controller import (
     TextGenerationController,
 )
-from megatron.core.utils import get_asyncio_loop
+from megatron.core.utils import get_asyncio_loop, get_mamba_inference_state_config_from_model

 try:
     from tqdm import tqdm
@@ -93,6 +93,10 @@ def __init__(
         # Store original context in case we need to fall back to legacy static engine
         original_context = text_generation_controller.inference_wrapped_model.inference_context

+        mamba_inference_state_config = get_mamba_inference_state_config_from_model(
+            text_generation_controller.inference_wrapped_model.model
+        )
+
         try:
             if not legacy:
                 dynamic_context = DynamicInferenceContext.from_config(
@@ -101,6 +105,7 @@
                     max_batch_size=max_batch_size,
                     buffer_size_gb=buffer_size_gb,
                     num_cuda_graphs=1,
+                    mamba_inference_state_config=mamba_inference_state_config,
                 )
                 self.controller.inference_wrapped_model.inference_context = dynamic_context
                 self.controller.inference_wrapped_model.prep_model_for_inference()
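A hedged sketch of the call pattern the static engine now uses. The keyword arguments shown for `from_config` come from the hunk above; its leading positional arguments are elided in the diff and are left elided here, and `inference_wrapped_model` stands in for `text_generation_controller.inference_wrapped_model`:

    mamba_inference_state_config = get_mamba_inference_state_config_from_model(
        inference_wrapped_model.model
    )
    dynamic_context = DynamicInferenceContext.from_config(
        ...,  # transformer / inference-wrapper configs, not shown in this hunk
        max_batch_size=max_batch_size,
        buffer_size_gb=buffer_size_gb,
        num_cuda_graphs=1,
        mamba_inference_state_config=mamba_inference_state_config,  # None for non-hybrid models
    )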

megatron/core/utils.py

Lines changed: 19 additions & 0 deletions
@@ -2154,6 +2154,25 @@ async def wrapper(*args, **kwargs):
     return _decorate if func is None else _decorate(func)


+def get_mamba_inference_state_config_from_model(model) -> Optional["MambaInferenceStateConfig"]:
+    """Returns Mamba inference state config from the model if it is a hybrid model."""
+    from megatron.core.inference.contexts.attention_context.mamba_metadata import (
+        MambaInferenceStateConfig,
+    )
+    from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols
+
+    decoder = get_attr_wrapped_model(model, "decoder")
+    layer_type_list = getattr(decoder, "layer_type_list", None)
+    if layer_type_list is not None and Symbols.MAMBA in layer_type_list:
+        (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request()
+        return MambaInferenceStateConfig(
+            layer_type_list=layer_type_list,
+            mamba_conv_states_shape=mamba_conv_states_shape,
+            mamba_ssm_states_shape=mamba_ssm_states_shape,
+        )
+    return None
+
+
 # ============================================================================
 # Backward Compatibility Decorators
 # ============================================================================
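The helper's contract, summarized as a short hedged sketch (`model` is assumed to be any Megatron model wrapper that exposes a `decoder` via `get_attr_wrapped_model`, as in the callers updated elsewhere in this commit):

    from megatron.core.utils import get_mamba_inference_state_config_from_model

    mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)
    if mamba_inference_state_config is None:
        # Pure-attention model: no Mamba layers, so no Mamba state buffers to allocate.
        pass
    else:
        # Hybrid model: the per-request state shapes travel with the config.
        conv_shape = mamba_inference_state_config.mamba_conv_states_shape
        ssm_shape = mamba_inference_state_config.mamba_ssm_states_shape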

megatron/rl/inference/megatron.py

Lines changed: 3 additions & 12 deletions
@@ -25,7 +25,7 @@
 from megatron.core.models.gpt.gpt_model import GPTModel
 from megatron.core.ssm.mamba_hybrid_layer_allocation import Symbols
 from megatron.core.transformer.module import MegatronModule
-from megatron.core.utils import get_attr_wrapped_model, log_single_rank
+from megatron.core.utils import get_mamba_inference_state_config_from_model, log_single_rank
 from megatron.training.global_vars import get_args, get_tokenizer

 from ..inference.inference_interface import (
@@ -107,14 +107,7 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen
     if args.enable_cuda_graph:
         num_cuda_graphs = args.inference_dynamic_batching_num_cuda_graphs

-    # Layer type list for hybrid models
-    decoder = get_attr_wrapped_model(model, "decoder")
-    layer_type_list = getattr(decoder, "layer_type_list", None)
-    if layer_type_list is not None and Symbols.MAMBA in layer_type_list:
-        (mamba_conv_states_shape, mamba_ssm_states_shape) = decoder.mamba_state_shapes_per_request()
-    else:
-        mamba_conv_states_shape = None
-        mamba_ssm_states_shape = None
+    mamba_inference_state_config = get_mamba_inference_state_config_from_model(model)

     # Inference context.
     inference_context = DynamicInferenceContext(
@@ -135,9 +128,7 @@ def get_dynamic_inference_engine(args: Namespace, model: MegatronModule, inferen
         tensor_model_parallel_size=args.tensor_model_parallel_size,
         materialize_only_last_token_logits=True,
         unified_memory_kvcache=args.inference_dynamic_batching_unified_memory_kvcache,
-        layer_type_list=layer_type_list,
-        mamba_conv_states_shape=mamba_conv_states_shape,
-        mamba_ssm_states_shape=mamba_ssm_states_shape,
+        mamba_inference_state_config=mamba_inference_state_config,
         metrics_writer=metrics_writer,
     )


tests/unit_tests/inference/contexts/test_dynamic_context.py

Lines changed: 14 additions & 5 deletions
@@ -5,6 +5,9 @@
 import pytest
 import torch

+from megatron.core.inference.contexts.attention_context.mamba_metadata import (
+    MambaInferenceStateConfig,
+)
 from megatron.core.inference.contexts.dynamic_context import (
     DynamicInferenceContext,
     RequestOverflowError,
@@ -52,8 +55,16 @@ def _get_dynamic_context(
 ):
     set_rounder(rounder)

-    if is_hybrid_model and layer_type_list is None:
-        layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP]
+    if is_hybrid_model:
+        if layer_type_list is None:
+            layer_type_list = [Symbols.MAMBA, Symbols.MLP, Symbols.ATTENTION, Symbols.MLP]
+        mamba_conv_states_shape = (544, 4)
+        mamba_ssm_states_shape = (8, 64, 16)
+        mamba_inference_state_config = MambaInferenceStateConfig(
+            layer_type_list, mamba_conv_states_shape, mamba_ssm_states_shape
+        )
+    else:
+        mamba_inference_state_config = None

     dynamic_context = DynamicInferenceContext(
         params_dtype=params_dtype,
@@ -66,9 +77,7 @@ def _get_dynamic_context(
         buffer_size_gb=buffer_size_gb,
         block_size_tokens=block_size_tokens,
         max_tokens=max_tokens,
-        layer_type_list=layer_type_list,
-        mamba_conv_states_shape=(544, 4),
-        mamba_ssm_states_shape=(8, 64, 16),
+        mamba_inference_state_config=mamba_inference_state_config,
         use_flashinfer_fused_rope=None,  # default to using flash-infer if available
         # this is for compatibility with the LTS environment
         unified_memory_level=0,  # unit tests currently broken with UVM
