 
 @dataclass
 class ModelArgs(BaseModelArgs):
+    model_type: str = "gpt_oss"
     num_hidden_layers: int = 36
     num_local_experts: int = 128
     num_experts_per_tok: int = 4
@@ -29,6 +30,7 @@ class ModelArgs(BaseModelArgs):
     sliding_window: int = 128
     rope_theta: int = 150000
     rope_scaling: Any = None
+    layer_types: list = None
 
 
 # These operators emulate particular methods in torch that don't exist in MLX natively
@@ -47,9 +49,12 @@ def swiglu(x_linear, x_glu, alpha: float = 1.702, limit: float = 7.0):
     # Clamp the input values
     x_glu = mx.clip(x_glu, a_min=None, a_max=limit)
     x_linear = mx.clip(x_linear, a_min=-limit, a_max=limit)
-    glu_scaled = (alpha * x_glu.astype(mx.float32)).astype(mx.bfloat16)
+
+    # Preserve input dtype
+    input_dtype = x_glu.dtype
+    glu_scaled = (alpha * x_glu.astype(mx.float32)).astype(input_dtype)
     negative_glu = (-glu_scaled).astype(mx.float32)
-    sig = (1.0 / (1.0 + mx.exp(negative_glu))).astype(mx.bfloat16)
+    sig = (1.0 / (1.0 + mx.exp(negative_glu))).astype(input_dtype)
 
     out_glu = x_glu * sig
     # Note we add an extra bias of 1 to the linear layer
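
The dtype change above matters for non-bfloat16 weights. A quick usage sketch, not part of the commit; the import path and the fact that the function's return (not shown in this hunk) keeps the gated product's dtype are assumptions:

```python
import mlx.core as mx
# Assumed import path, for illustration only
from mlx_lm.models.gpt_oss import swiglu

x = mx.random.normal((2, 8)).astype(mx.float16)
y = swiglu(x, x)               # linear and gate branches share the same input here
assert y.dtype == mx.float16   # with the old hard-coded bfloat16 casts, the result dtype did not follow the input
```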
@@ -168,11 +173,11 @@ def _make_mask(L, offset):
 
         return self._previous_mask[..., : min(L + offset, window_size + 1)]
 
-    def get_mask(self, x, cache, window_size, idx):
-        if idx % 2 == 1:
-            return self.get_causal_mask(x, cache)
-        else:
+    def get_mask(self, x, cache, window_size):
+        if window_size is not None:
             return self.get_sliding_window_mask(x, cache, window_size)
+        else:
+            return self.get_causal_mask(x, cache)
 
     def __call__(self, x: mx.array, mask: mx.array, cache=None) -> mx.array:
         B, L, _ = x.shape
@@ -225,18 +230,15 @@ def __init__(self, config: ModelArgs):
         self.router = nn.Linear(config.hidden_size, config.num_local_experts, bias=True)
 
     def __call__(self, x: mx.array) -> mx.array:
-        x = x.reshape(-1, self.hidden_size)
-
-        # N.B. As elsewhere, upcast is required in linear layers
-        g = self.router(x.astype(mx.float32)).astype(mx.bfloat16)
+        g = self.router(x)
         experts, indices = mlx_topk(g, k=self.num_experts_per_tok, axis=-1)
         expert_weights = mx.softmax(experts, axis=-1, precise=True)
 
         # Experts block
         x = self.experts(x, indices)
 
-        x = x * mx.expand_dims(expert_weights, axis=2)
-        return x.sum(axis=1)
+        x = x * mx.expand_dims(expert_weights, axis=-1)
+        return x.sum(axis=-2)
 
 
 class TransformerBlock(nn.Module):
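
With the flattening reshape gone, the router and the weighted mixture operate directly on (batch, sequence, ...) shaped arrays: the expert weights broadcast over the last axis and the reduction runs over the expert axis. A self-contained sketch of that broadcasting (shapes are illustrative, not from the commit):

```python
import mlx.core as mx

B, L, k, hidden = 2, 5, 4, 16
expert_out = mx.random.normal((B, L, k, hidden))      # per-token outputs of the top-k experts
gates = mx.softmax(mx.random.normal((B, L, k)), axis=-1, precise=True)

mixed = (expert_out * mx.expand_dims(gates, axis=-1)).sum(axis=-2)
assert tuple(mixed.shape) == (B, L, hidden)
```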
@@ -267,6 +269,10 @@ def __init__(self, args: ModelArgs):
         super().__init__()
         self.embed_tokens = nn.Embedding(args.vocab_size, args.hidden_size)
         self.norm = nn.RMSNorm(args.hidden_size, args.rms_norm_eps)
+        self.layer_types = args.layer_types or [
+            "sliding_attention",
+            "full_attention",
+        ] * (args.num_hidden_layers // 2)
         self.layers = [TransformerBlock(args) for _ in range(args.num_hidden_layers)]
         self.window_size = args.sliding_window
 
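
For a config without layer_types, the fallback reproduces the old parity rule (sliding window on even layer indices, full attention on odd). A minimal sketch with six layers:

```python
num_hidden_layers = 6
layer_types = ["sliding_attention", "full_attention"] * (num_hidden_layers // 2)
# -> ['sliding_attention', 'full_attention', 'sliding_attention',
#     'full_attention', 'sliding_attention', 'full_attention']
```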
@@ -287,8 +293,10 @@ def __call__(
 
         if mask is None:
             masks = [
-                l.self_attn.get_mask(x, c, self.window_size, i)
-                for i, (l, c) in enumerate(zip(self.layers, cache))
+                l.self_attn.get_mask(
+                    x, c, self.window_size if lt == "sliding_attention" else None
+                )
+                for (l, c, lt) in zip(self.layers, cache, self.layer_types)
             ]
         else:
             masks = [mask] * len(self.layers)
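
A small sketch (not from the commit) of how that comprehension resolves the window size per layer; None falls through to the causal mask in get_mask:

```python
window_size = 128
layer_types = ["sliding_attention", "full_attention"] * 2
per_layer_window = [
    window_size if lt == "sliding_attention" else None for lt in layer_types
]
# -> [128, None, 128, None]
```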
@@ -328,10 +336,9 @@ def convert_moe_packed_tensors(blocks, scales):
     )
 
     *prefix_shape, G, B = blocks.shape
-    rows_total = math.prod(prefix_shape) * G
 
-    blocks = blocks.reshape(rows_total, B)
-    scales = scales.reshape(rows_total, 1)
+    blocks = blocks.reshape(-1, B)
+    scales = scales.reshape(-1, 1)
 
     idx_lo = blocks & 0x0F
     idx_hi = blocks >> 4
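
reshape(-1, B) collapses every leading dimension, which is exactly the row count the removed rows_total line computed by hand. A sketch verifying the equivalence on a dummy packed tensor:

```python
import math
import mlx.core as mx

blocks = mx.zeros((3, 4, 7, 16), dtype=mx.uint8)    # (*prefix_shape, G, B)
*prefix_shape, G, B = blocks.shape
rows_total = math.prod(prefix_shape) * G            # the removed explicit computation

assert blocks.reshape(-1, B).shape[0] == rows_total
```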
@@ -346,9 +353,7 @@ class Model(nn.Module):
     def __init__(self, args: ModelArgs):
         super().__init__()
         self.args = args
-        self.model_type = (
-            args.model_type if hasattr(args, "model_type") else "gpt_oss_moe"
-        )
+        self.model_type = args.model_type
         self.model = GptOssMoeModel(args)
         self.lm_head = nn.Linear(args.hidden_size, args.vocab_size, bias=False)
 
@@ -405,9 +410,8 @@ def layers(self):
 
     def make_cache(self):
         caches = []
-        for i in range(self.args.num_hidden_layers):
-            # full attn on odd indices, swa on even
-            if i % 2 == 1:
+        for lt in self.model.layer_types:
+            if lt == "full_attention":
                 caches.append(KVCache())
             else:
                 caches.append(
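
Cache construction now keys off the same layer_types list as the masks. The sliding-window branch is truncated in this hunk, so the rotating cache below is an assumption about what it appends; a sketch of the intended pairing, assuming mlx_lm's cache classes:

```python
# Assumed imports and constructor arguments, for illustration only
from mlx_lm.models.cache import KVCache, RotatingKVCache

def make_cache_sketch(layer_types, window_size):
    caches = []
    for lt in layer_types:
        if lt == "full_attention":
            caches.append(KVCache())
        else:
            # Assumed: a rotating cache bounded by the sliding window
            caches.append(RotatingKVCache(max_size=window_size))
    return caches
```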