
 @dataclass
 class ModelArgs(BaseModelArgs):
+    # Required fields (no defaults)
     model_type: str
     vocab_size: int
     hidden_size: int
@@ -29,34 +30,42 @@ class ModelArgs(BaseModelArgs):
     num_attention_heads: int
     num_key_value_heads: int
     attention_bias: bool
-
-    # Scalar multipliers
     embedding_multiplier: float
     attention_multiplier: float
     logits_scaling: float
     residual_multiplier: float
-
-    # MoE parameters
-    num_local_experts: int
-    num_experts_per_tok: int
-    shared_intermediate_size: int
-
-    # Mamba parameters
-    mamba_n_heads: int
-    mamba_d_head: int
-    mamba_proj_bias: bool
-    mamba_d_state: int
-    mamba_d_conv: int
-    mamba_n_groups: int
-    mamba_conv_bias: bool
-
     layer_types: List[str]
     rms_norm_eps: float
     rope_theta: float
+
+    # Optional fields (with defaults)
+    # MoE parameters (optional for dense mode)
+    num_local_experts: Optional[int] = None
+    num_experts_per_tok: Optional[int] = None
+    shared_intermediate_size: Optional[int] = None
+
+    # Mamba parameters (optional for non-hybrid mode)
+    mamba_n_heads: Optional[int] = None
+    mamba_d_head: Optional[int] = None
+    mamba_proj_bias: Optional[bool] = None
+    mamba_d_state: Optional[int] = None
+    mamba_d_conv: Optional[int] = None
+    mamba_n_groups: Optional[int] = None
+    mamba_conv_bias: Optional[bool] = None
+
+    # Dense MLP parameters (for non-MoE mode)
+    mlp_bias: bool = False
+
+    # Other optional parameters
     position_embedding_type: str = "rope"
     tie_word_embeddings: bool = True
     time_step_limit: Tuple[float, float] = (0.001, 100.0)

+    # Mode flags - inferred from num_local_experts
+    @property
+    def use_moe(self) -> bool:
+        return bool(self.num_local_experts)
+

 class GraniteMoeHybridRMSNormGated(nn.Module):
     def __init__(self, hidden_size: int, eps: float = 1e-6):
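(Aside: a minimal sketch of how the mode inference above is meant to behave. The stripped-down `_ArgsSketch` class below is illustrative only and omits nearly all fields of the real ModelArgs.)

```python
# Illustrative stand-in for ModelArgs: use_moe falls out of num_local_experts
# (None or 0 -> dense MLP path, a positive count -> MoE path).
from dataclasses import dataclass
from typing import Optional


@dataclass
class _ArgsSketch:
    num_local_experts: Optional[int] = None

    @property
    def use_moe(self) -> bool:
        return bool(self.num_local_experts)


print(_ArgsSketch().use_moe)                      # False -> dense model
print(_ArgsSketch(num_local_experts=8).use_moe)   # True  -> MoE model
```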
@@ -314,11 +323,27 @@ def __call__(self, x: mx.array) -> mx.array:
         return self.output_linear(nn.silu(gate) * up)


+class GraniteMoeHybridMLP(nn.Module):
+    def __init__(self, args: ModelArgs):
+        super().__init__()
+        dim = args.hidden_size
+        hidden_dim = args.intermediate_size
+        mlp_bias = args.mlp_bias
+
+        self.gate_proj = nn.Linear(dim, hidden_dim, bias=mlp_bias)
+        self.down_proj = nn.Linear(hidden_dim, dim, bias=mlp_bias)
+        self.up_proj = nn.Linear(dim, hidden_dim, bias=mlp_bias)
+
+    def __call__(self, x) -> mx.array:
+        return self.down_proj(nn.silu(self.gate_proj(x)) * self.up_proj(x))
+
+
 class GraniteMoeHybridLayer(nn.Module):
     def __init__(self, args: ModelArgs, layer_type: str):
         super().__init__()
         self.layer_type = layer_type
         self.residual_multiplier = args.residual_multiplier
+        self.use_moe = args.use_moe

         self.input_layernorm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)

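(Aside: `GraniteMoeHybridMLP` above is a standard SwiGLU-style feed-forward block. A small standalone shape check, with toy dimensions invented purely for illustration:)

```python
# Toy shape check for a SwiGLU-style MLP like the one added above; bias=False
# mirrors the mlp_bias default. Dimensions here are made up.
import mlx.core as mx
import mlx.nn as nn

dim, hidden_dim = 8, 32
gate_proj = nn.Linear(dim, hidden_dim, bias=False)
up_proj = nn.Linear(dim, hidden_dim, bias=False)
down_proj = nn.Linear(hidden_dim, dim, bias=False)

x = mx.random.normal(shape=(1, 4, dim))            # (batch, seq, hidden)
y = down_proj(nn.silu(gate_proj(x)) * up_proj(x))   # silu(gate) gates up, then project down
print(y.shape)                                       # (1, 4, 8)
```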
@@ -329,8 +354,14 @@ def __init__(self, args: ModelArgs, layer_type: str):
         else:
             raise ValueError(f"Unknown layer type: {layer_type}")

-        self.shared_mlp = GraniteMoeHybridSharedMLP(args)
-        self.block_sparse_moe = GraniteMoeHybridMoE(args)
+        # MoE or dense MLP after attention/mamba
+        if self.use_moe:
+            self.shared_mlp = GraniteMoeHybridSharedMLP(args)
+            self.block_sparse_moe = GraniteMoeHybridMoE(args)
+        else:
+            # Dense MLP mode
+            self.mlp = GraniteMoeHybridMLP(args)
+
         self.post_attention_layernorm = nn.RMSNorm(
             args.hidden_size, eps=args.rms_norm_eps
         )
@@ -352,13 +383,16 @@ def __call__(

         hidden_states = residual + hidden_states * self.residual_multiplier

-        # Second block: MoE + shared_mlp
+        # Second block: MoE + shared_mlp OR dense MLP
         residual = hidden_states
         normed = self.post_attention_layernorm(hidden_states)

-        moe_out = self.block_sparse_moe(normed)
-        shared_out = self.shared_mlp(normed)
-        mlp_out = moe_out + shared_out
+        if self.use_moe:
+            moe_out = self.block_sparse_moe(normed)
+            shared_out = self.shared_mlp(normed)
+            mlp_out = moe_out + shared_out
+        else:
+            mlp_out = self.mlp(normed)

         hidden_states = residual + mlp_out * self.residual_multiplier

@@ -375,9 +409,16 @@ def __init__(self, args: ModelArgs):
         ]
         self.norm = nn.RMSNorm(args.hidden_size, eps=args.rms_norm_eps)
         self.embedding_multiplier = args.embedding_multiplier
-        self.fa_idx = args.layer_types.index("attention")
-        self.ssm_idx = args.layer_types.index("mamba")
-        self.layer_types = args.layer_types
+
+        # Handle hybrid vs non-hybrid mode
+        self.fa_idx = (
+            args.layer_types.index("attention")
+            if "attention" in args.layer_types
+            else None
+        )
+        self.ssm_idx = (
+            args.layer_types.index("mamba") if "mamba" in args.layer_types else None
+        )

     def __call__(
         self,
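(Aside: the guard matters because `list.index` raises ValueError when a layer type is absent, e.g. in a hypothetical attention-free configuration. A quick sketch with made-up layer lists:)

```python
# Made-up layer_types lists: the guarded lookup yields None instead of raising
# ValueError when a layer type does not occur at all.
hybrid = ["mamba", "mamba", "attention", "mamba"]
mamba_only = ["mamba", "mamba", "mamba", "mamba"]

for layer_types in (hybrid, mamba_only):
    fa_idx = layer_types.index("attention") if "attention" in layer_types else None
    ssm_idx = layer_types.index("mamba") if "mamba" in layer_types else None
    print(fa_idx, ssm_idx)  # 2 0, then None 0
```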
@@ -389,11 +430,16 @@ def __call__(
         if cache is None:
             cache = [None] * len(self.layers)

-        attn_mask = create_attention_mask(hidden_states, cache[self.fa_idx])
-        mamba_mask = create_ssm_mask(hidden_states, cache[self.ssm_idx])
+        # Create masks based on what layer types exist
+        attn_mask = None
+        mamba_mask = None
+
+        if self.fa_idx is not None:
+            attn_mask = create_attention_mask(hidden_states, cache[self.fa_idx])
+        if self.ssm_idx is not None:
+            mamba_mask = create_ssm_mask(hidden_states, cache[self.ssm_idx])

-        cache_counter = 0
-        for layer, c, layer_type in zip(self.layers, cache, self.layer_types):
+        for layer, c in zip(self.layers, cache):
             mask = attn_mask if layer.layer_type == "attention" else mamba_mask
             hidden_states = layer(hidden_states, mask=mask, cache=c)

@@ -443,8 +489,11 @@ def sanitize(self, weights):
             if "conv1d.weight" in k and v.shape[-1] != 1:
                 weights[k] = v.moveaxis(2, 1)

-        # Handle MoE weight transformation to SwitchGLU format
-        if "model.layers.0.block_sparse_moe.input_linear.weight" in weights:
+        # Handle MoE weight transformation to SwitchGLU format (only for MoE models)
+        if (
+            self.args.use_moe
+            and "model.layers.0.block_sparse_moe.input_linear.weight" in weights
+        ):
             for l in range(self.args.num_hidden_layers):
                 prefix = f"model.layers.{l}.block_sparse_moe"

@@ -461,12 +510,31 @@ def sanitize(self, weights):
461510 f"{ prefix } .output_linear.weight"
462511 )
463512
513+ # Handle dense MLP weight transformation (for dense models)
514+ elif (
515+ not self .args .use_moe
516+ and "model.layers.0.shared_mlp.input_linear.weight" in weights
517+ ):
518+ for l in range (self .args .num_hidden_layers ):
519+ prefix = f"model.layers.{ l } .shared_mlp"
520+
521+ # Transform shared_mlp weights to standard mlp weights
522+ input_weight = weights .pop (f"{ prefix } .input_linear.weight" )
523+ # Split into gate and up projections (each half)
524+ gate_proj , up_proj = mx .split (input_weight , 2 , axis = 0 )
525+ weights [f"model.layers.{ l } .mlp.gate_proj.weight" ] = gate_proj
526+ weights [f"model.layers.{ l } .mlp.up_proj.weight" ] = up_proj
527+
528+ weights [f"model.layers.{ l } .mlp.down_proj.weight" ] = weights .pop (
529+ f"{ prefix } .output_linear.weight"
530+ )
531+
464532 return weights
465533
466534 @property
467535 def quant_predicate (self ):
468536 def predicate (path , _ ):
469- if path .endswith ("router.layer" ):
537+ if self . args . use_moe and path .endswith ("router.layer" ):
470538 return {"group_size" : 64 , "bits" : 8 }
471539 return True
472540
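(Aside: a toy check of the dense-path remapping above. The shapes are invented for illustration and assume, as the in-code comment says, that the fused `input_linear` weight stacks the gate rows ahead of the up rows.)

```python
# The fused shared_mlp.input_linear weight, shape (2 * intermediate_size, hidden_size),
# splits into equal gate/up halves along axis 0.
import mlx.core as mx

hidden_size, intermediate_size = 8, 32
fused = mx.random.normal(shape=(2 * intermediate_size, hidden_size))

gate_proj, up_proj = mx.split(fused, 2, axis=0)
print(gate_proj.shape, up_proj.shape)  # (32, 8) (32, 8)
```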