Skip to content

Commit 2dc7ab9

Browse files
committed
Enhance numerical stability for HIP/Strix Halo compatibility
- Updated RMS LayerNorm implementation to use float32 for improved precision.
- Introduced safe mode for MLP operations in Gemma and Llama models to prevent dtype mismatches.
- Replaced fused cross-entropy loss with standard PyTorch CE loss in Mistral and CausalLM for better handling of NaNs on Strix Halo.
- Added environment variable checks to conditionally apply these changes based on the UNSLOTH_STRIX_HALO_SAFE setting.
1 parent 3c87aa4 commit 2dc7ab9

File tree

5 files changed

+232
-132
lines changed

5 files changed

+232
-132
lines changed

unsloth/kernels/rms_layernorm.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -147,7 +147,8 @@ def _gemma_rms_layernorm_forward(
147147
W_row = tl.load(W + col_offsets, mask = mask, other = 0).to(tl.float32)
148148

149149
row_var = tl.sum(X_row * X_row, axis = 0) / n_cols
150-
inv_var = tl.math.rsqrt(row_var + eps)
150+
eps_f32 = tl.full((), eps, tl.float32)
151+
inv_var = tl.math.rsqrt(row_var + eps_f32)
151152
tl.store(r, inv_var)
152153
normed = X_row * inv_var
153154
output = normed * (W_row + 1.0)

unsloth/models/_utils.py

100644100755
Lines changed: 98 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1842,6 +1842,104 @@ def unsloth_compile_transformers(
18421842
return_logits = return_logits,
18431843
supports_sdpa = supports_sdpa,
18441844
)
1845+
1846+
# After compilation, patch GPT-OSS experts on HIP/Strix-safe machines so
# that expert matmuls use matching dtypes (avoids float vs bfloat16).
# NOTE(review): this only activates when DEVICE_TYPE == "hip" AND the user
# has opted in via UNSLOTH_STRIX_HALO_SAFE=1; otherwise it is a no-op.
if DEVICE_TYPE == "hip" and os.environ.get("UNSLOTH_STRIX_HALO_SAFE", "0") == "1":
    try:
        # The compiled GPT-OSS module is generated elsewhere; if it is not
        # importable here, the bare except below leaves the default forward
        # untouched (best-effort patching).
        import unsloth_compiled_module_gpt_oss as _gpt_oss_compiled

        @torch.compiler.disable(recursive = False)
        def GptOssExperts_forward_safe(
            self,
            hidden_states: torch.Tensor,
            router_indices = None,
            routing_weights = None,
        ) -> torch.Tensor:
            """Dtype-safe replacement for the compiled GptOssExperts forward.

            Mirrors the stock mixture-of-experts forward, but casts the
            gated activations to the down-projection weight's dtype before
            the final matmul so both operands match on HIP.

            hidden_states is flattened to (tokens, hidden_size); the result
            is reshaped back using the original leading batch dimension.
            NOTE(review): router_indices / routing_weights are assumed to be
            the compiled module's routing tensors (routing_weights has one
            column per expert) — confirm against the caller.
            """
            batch_size = hidden_states.shape[0]
            hidden_states = hidden_states.reshape(-1, self.hidden_size)
            num_experts = routing_weights.shape[1]

            if hidden_states.device.type == "cpu" or self.training:
                # Sparse path (CPU or training): visit only experts that
                # actually received tokens, accumulating into next_states.
                next_states = torch.zeros_like(
                    hidden_states,
                    dtype = hidden_states.dtype,
                    device = hidden_states.device,
                )
                with torch.no_grad():
                    # one_hot over num_experts + 1 classes: the extra class
                    # is a sentinel index, skipped in the loop below.
                    expert_mask = torch.nn.functional.one_hot(
                        router_indices,
                        num_classes = num_experts + 1,
                    )
                    expert_mask = expert_mask.permute(2, 1, 0)
                    # Experts with at least one routed token.
                    expert_hit = torch.greater(
                        expert_mask.sum(dim = (-1, -2)), 0
                    ).nonzero()

                for expert_idx in expert_hit[:]:
                    expert_idx = expert_idx[0]
                    if expert_idx == num_experts:
                        # Sentinel "no expert" class — nothing to compute.
                        continue
                    with torch.no_grad():
                        _, token_idx = torch.where(expert_mask[expert_idx])
                    current_state = hidden_states[token_idx]
                    gate_up = (
                        current_state @ self.gate_up_proj[expert_idx]
                        + self.gate_up_proj_bias[expert_idx]
                    )
                    # Even/odd interleaved columns hold gate and up halves.
                    gate, up = gate_up[..., ::2], gate_up[..., 1::2]
                    gate = gate.clamp(min = None, max = self.limit)
                    up = up.clamp(min = -self.limit, max = self.limit)
                    glu = gate * torch.sigmoid(gate * self.alpha)
                    gated_output = (up + 1) * glu

                    # Ensure matmul uses a consistent dtype
                    w = self.down_proj[expert_idx]
                    gated_output = gated_output.to(w.dtype)
                    out = gated_output @ w + self.down_proj_bias[expert_idx]

                    weighted_output = (
                        out * routing_weights[token_idx, expert_idx, None]
                    )
                    # Cast back before accumulation so index_add_ operands
                    # share next_states' dtype.
                    next_states.index_add_(
                        0,
                        token_idx,
                        weighted_output.to(hidden_states.dtype),
                    )
                next_states = next_states.view(batch_size, -1, self.hidden_size)
            else:
                # Dense path: run every expert over all tokens with batched
                # matmuls, then weight per-expert outputs and sum them.
                # NOTE(review): no explicit dtype cast here — presumably the
                # bmm operands already share a dtype on this path; confirm.
                hidden_states = hidden_states.repeat(num_experts, 1)
                hidden_states = hidden_states.view(
                    num_experts,
                    -1,
                    self.hidden_size,
                )
                gate_up = (
                    torch.bmm(hidden_states, self.gate_up_proj)
                    + self.gate_up_proj_bias[..., None, :]
                )
                gate, up = gate_up[..., ::2], gate_up[..., 1::2]
                gate = gate.clamp(min = None, max = self.limit)
                up = up.clamp(min = -self.limit, max = self.limit)
                glu = gate * torch.sigmoid(gate * self.alpha)
                next_states = torch.bmm(((up + 1) * glu), self.down_proj)
                next_states = next_states + self.down_proj_bias[..., None, :]
                next_states = next_states.view(
                    num_experts,
                    batch_size,
                    -1,
                    self.hidden_size,
                )
                next_states = next_states * routing_weights.transpose(
                    0, 1
                ).view(num_experts, batch_size, -1)[..., None]
                next_states = next_states.sum(dim = 0)
            return next_states

        # Install the safe forward on the compiled module, then drop the
        # module reference so it does not linger in this namespace.
        _gpt_oss_compiled.GptOssExperts_forward = GptOssExperts_forward_safe
        del _gpt_oss_compiled
    except Exception:
        # Best-effort: if the compiled module is missing or its interface
        # changed, keep the default (unpatched) forward.
        pass
18451943
# Redo patches which override compiler
18461944
for temporary_patch in TEMPORARY_PATCHES:
18471945
temporary_patch()

unsloth/models/gemma.py

100644100755
Lines changed: 25 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,8 @@
1919
import math
2020
import os
2121

22-
_DISABLE_TRITON_RMSNORM = os.getenv("UNSLOTH_DISABLE_TRITON_RMSNORM", "0") == "1"
23-
_LAYERNORM_IMPL = os.getenv("UNSLOTH_LAYERNORM_IMPL", "").lower()
24-
_DISABLE_AUTODTYPE_CAST = os.getenv("UNSLOTH_DISABLE_AUTODTYPE_CAST", "0") == "1"
22+
_STRIX_HALO_SAFE = os.getenv("UNSLOTH_STRIX_HALO_SAFE", "0") == "1"
23+
_GEMMA_STRIX_SAFE = ("hip" == DEVICE_TYPE) and _STRIX_HALO_SAFE
2524

2625
try:
2726
from transformers.models.gemma.modeling_gemma import (
@@ -123,16 +122,18 @@ def GemmaDecoderLayer_fast_forward(
123122
hidden_states = fast_rms_layernorm_inference_gemma(
124123
self.post_attention_layernorm, hidden_states, out_weight
125124
)
126-
hidden_states = fast_geglu_inference(self.mlp, hidden_states)
127-
hidden_states += residual
125+
if _GEMMA_STRIX_SAFE:
126+
mlp_in = hidden_states.to(torch.float32)
127+
mlp_out = self.mlp(mlp_in)
128+
hidden_states = residual + mlp_out.to(hidden_states.dtype)
129+
else:
130+
hidden_states = fast_geglu_inference(self.mlp, hidden_states)
131+
hidden_states += residual
128132
else:
129133
residual = hidden_states
130-
if _DISABLE_TRITON_RMSNORM or _LAYERNORM_IMPL == "python":
131-
hidden_states = self.input_layernorm(hidden_states)
132-
else:
133-
hidden_states = fast_rms_layernorm(
134-
self.input_layernorm, hidden_states, gemma = True
135-
)
134+
hidden_states = fast_rms_layernorm(
135+
self.input_layernorm, hidden_states, gemma = True
136+
)
136137
hidden_states, self_attn_weights, present_key_value = self.self_attn(
137138
hidden_states = hidden_states,
138139
causal_mask = causal_mask,
@@ -147,14 +148,19 @@ def GemmaDecoderLayer_fast_forward(
147148

148149
# Fully Connected
149150
residual = hidden_states
150-
if _DISABLE_TRITON_RMSNORM or _LAYERNORM_IMPL == "python":
151-
hidden_states = self.post_attention_layernorm(hidden_states)
151+
hidden_states = fast_rms_layernorm(
152+
self.post_attention_layernorm, hidden_states, gemma = True
153+
)
154+
155+
# On Strix Halo (HIP) in safe mode, run Gemma MLP in float32 for
156+
# numerical stability, then cast back. Else use the default path.
157+
if _GEMMA_STRIX_SAFE:
158+
mlp_in = hidden_states.to(torch.float32)
159+
mlp_out = self.mlp(mlp_in)
160+
hidden_states = residual + mlp_out.to(hidden_states.dtype)
152161
else:
153-
hidden_states = fast_rms_layernorm(
154-
self.post_attention_layernorm, hidden_states, gemma = True
155-
)
156-
hidden_states = self.mlp(hidden_states)
157-
hidden_states = residual + hidden_states
162+
hidden_states = self.mlp(hidden_states)
163+
hidden_states = residual + hidden_states
158164

159165
outputs = (hidden_states,)
160166
if output_attentions:
@@ -186,8 +192,7 @@ def GemmaModel_fast_forward_inference(
186192
)
187193
input_ids = input_ids[:, : self.max_seq_length]
188194
hidden_states = self.model.embed_tokens(input_ids)
189-
if not _DISABLE_AUTODTYPE_CAST:
190-
hidden_states = hidden_states.to(_get_dtype(dtype_from_config(self.config)))
195+
hidden_states = hidden_states.to(_get_dtype(dtype_from_config(self.config)))
191196
# 3072**0.5 = 55.5000 in bfloat16, whilst 55.4256 in float32
192197
# 2048**0.5 = 45.2500 in bfloat16, whilst 45.2548 in float32
193198
hidden_states *= torch.tensor(

0 commit comments

Comments (0)