
Commit b1c616c

[Dev release cherry pick] Fixes for gpt-oss (#2076)
Signed-off-by: Chen Cui <[email protected]>
1 parent 193a929 commit b1c616c

5 files changed: +43, -40 lines

megatron/core/models/common/embeddings/rope_utils.py

Lines changed: 26 additions & 25 deletions
@@ -268,46 +268,47 @@ def apply_rotary_pos_emb(
     if config.apply_rope_fusion:
         if cu_seqlens is None:
             # NOTE: TE backends do not support mRoPE in bshd format when bs > 1.
+            use_unfused = False
             if config.mrope_section is not None and freqs.shape[1] > 1:
                 # TODO: Add a check in TransformerConfig and remove this unfused implementation.
                 warnings.warn(
                     "apply_rope_fusion does not support mRoPE in bshd format when bs > 1. "
                     "Please set apply_rope_fusion to false. This will become an error in v0.16."
                 )
-                return _apply_rotary_pos_emb_bshd(
-                    t,
-                    freqs,
-                    rotary_interleaved=config.rotary_interleaved,
-                    multi_latent_attention=config.multi_latent_attention,
-                    mscale=mscale,
+                use_unfused = True
+            if mscale != 1.0:
+                warnings.warn(
+                    f"mscale={mscale} is not supported by TE's fused RoPE. "
+                    "Using unfused implementation."
                 )
-            else:
+                use_unfused = True
+            if not use_unfused:
                 assert fused_apply_rotary_pos_emb is not None, "apply_rope_fusion is not available."
                 return fused_apply_rotary_pos_emb(t, freqs, interleaved=config.rotary_interleaved)
         else:
             assert fused_apply_rotary_pos_emb_thd is not None, "apply_rope_fusion is not available."
             return fused_apply_rotary_pos_emb_thd(
                 t, cu_seqlens, freqs, cp_size=cp_group.size(), cp_rank=cp_group.rank()
             )
+    # use unfused implementation
+    if cu_seqlens is None:
+        return _apply_rotary_pos_emb_bshd(
+            t,
+            freqs,
+            rotary_interleaved=config.rotary_interleaved,
+            multi_latent_attention=config.multi_latent_attention,
+            mscale=mscale,
+        )
     else:
-        if cu_seqlens is None:
-            return _apply_rotary_pos_emb_bshd(
-                t,
-                freqs,
-                rotary_interleaved=config.rotary_interleaved,
-                multi_latent_attention=config.multi_latent_attention,
-                mscale=mscale,
-            )
-        else:
-            return _apply_rotary_pos_emb_thd(
-                t,
-                cu_seqlens,
-                freqs,
-                rotary_interleaved=config.rotary_interleaved,
-                multi_latent_attention=config.multi_latent_attention,
-                mscale=mscale,
-                cp_group=cp_group,
-            )
+        return _apply_rotary_pos_emb_thd(
+            t,
+            cu_seqlens,
+            freqs,
+            rotary_interleaved=config.rotary_interleaved,
+            multi_latent_attention=config.multi_latent_attention,
+            mscale=mscale,
+            cp_group=cp_group,
+        )
 
 
 def apply_rotary_pos_emb_with_cos_sin(
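
The net effect of this hunk: instead of returning early from inside the fused branch, every condition the TE fused kernel cannot handle (mRoPE in bshd format with bs > 1, or a non-unit mscale) now sets a use_unfused flag and falls through to a single unfused code path at the end of the function. A minimal, self-contained sketch of that control-flow pattern; _fused_rope and _unfused_rope below are placeholder stand-ins, not the Megatron kernels:

import warnings

def _fused_rope(t, freqs):
    # placeholder for the fused TE kernel
    return t

def _unfused_rope(t, freqs, mscale=1.0):
    # placeholder for the unfused fallback
    return t

def apply_rope(t, freqs, *, fusion_enabled, mrope_bshd=False, mscale=1.0):
    if fusion_enabled:
        use_unfused = False
        if mrope_bshd:
            warnings.warn("fused RoPE does not support mRoPE in bshd format; using unfused path.")
            use_unfused = True
        if mscale != 1.0:
            warnings.warn(f"mscale={mscale} is not supported by the fused kernel; using unfused path.")
            use_unfused = True
        if not use_unfused:
            return _fused_rope(t, freqs)  # fast path
    # single unfused fallback shared by every unsupported configuration
    return _unfused_rope(t, freqs, mscale=mscale)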

megatron/core/models/common/embeddings/yarn_rotary_pos_embedding.py

Lines changed: 7 additions & 4 deletions
@@ -228,22 +228,25 @@ def _yarn_get_mscale(scale: float = 1, mscale: float = 1) -> float:
 
 @lru_cache(maxsize=8)
 def _yarn_get_concentration_factor(
-    scaling_factor: float, mscale: float, mscale_all_dim: float
+    scaling_factor: float, mscale: Optional[float], mscale_all_dim: Optional[float]
 ) -> float:
     """
     Get the concentration factor (factor multiplied to the sine and cosine components of the
     embedding). This factor is also known as attention factor, and sometimes homonymously known as
     "mscale"
     """
+    if mscale is None or mscale_all_dim is None:
+        return _yarn_get_mscale(scaling_factor)
     return float(
         _yarn_get_mscale(scaling_factor, mscale) / _yarn_get_mscale(scaling_factor, mscale_all_dim)
     )
 
 
 def _yarn_get_concentration_factor_from_config(config: TransformerConfig) -> float:
-    fields = ["yarn_rotary_scaling_factor", "yarn_mscale", "yarn_mscale_all_dim"]
-    if all(hasattr(config, f) for f in fields):
+    if hasattr(config, "yarn_rotary_scaling_factor"):
         return _yarn_get_concentration_factor(
-            config.yarn_rotary_scaling_factor, config.yarn_mscale, config.yarn_mscale_all_dim
+            config.yarn_rotary_scaling_factor,
+            getattr(config, "yarn_mscale", None),
+            getattr(config, "yarn_mscale_all_dim", None),
         )
     return 1.0
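
This makes the YaRN concentration factor tolerant of configs (presumably the gpt-oss configs this commit targets) that only define yarn_rotary_scaling_factor: if yarn_mscale or yarn_mscale_all_dim is missing, the helper falls back to the plain mscale of the scaling factor instead of raising. A hedged sketch of that fallback, using the standard YaRN mscale formula as an assumed stand-in for Megatron's _yarn_get_mscale:

import math
from typing import Optional

def yarn_mscale(scale: float = 1.0, mscale: float = 1.0) -> float:
    # standard YaRN attention-scaling formula (assumption: matches _yarn_get_mscale)
    return 1.0 if scale <= 1.0 else 0.1 * mscale * math.log(scale) + 1.0

def concentration_factor(
    scaling_factor: float, mscale: Optional[float], mscale_all_dim: Optional[float]
) -> float:
    # new behaviour: missing knobs fall back to the single-argument form
    if mscale is None or mscale_all_dim is None:
        return yarn_mscale(scaling_factor)
    return yarn_mscale(scaling_factor, mscale) / yarn_mscale(scaling_factor, mscale_all_dim)

print(concentration_factor(32.0, None, None))  # a config without the mscale fields still works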

megatron/core/transformer/dot_product_attention.py

Lines changed: 8 additions & 2 deletions
@@ -116,13 +116,19 @@ def __init__(
         if self.config.softmax_type == "vanilla":
             self.softmax_offset = None
         elif self.config.softmax_type == "off-by-one":
-            self.softmax_offset = torch.zeros(self.num_attention_heads_per_partition)
+            self.softmax_offset = torch.zeros(
+                self.num_attention_heads_per_partition,
+                device=torch.cuda.current_device(),
+                dtype=self.config.params_dtype,
+            )
         elif self.config.softmax_type == "learnable":
             self.register_parameter(
                 "softmax_offset",
                 torch.nn.Parameter(
                     torch.empty(
-                        self.num_attention_heads_per_partition, dtype=self.config.params_dtype
+                        self.num_attention_heads_per_partition,
+                        device=torch.cuda.current_device(),
+                        dtype=self.config.params_dtype,
                     )
                 ),
             )
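
The fix here is allocation placement: the per-head softmax offset is now created directly on the current CUDA device in params_dtype, rather than as a default float32 CPU tensor that would later mismatch the device and dtype of the attention scores. A small sketch of the pattern, with illustrative values (8 heads, bfloat16) rather than a real config:

import torch

num_heads = 8                      # illustrative, not from a real config
params_dtype = torch.bfloat16

if torch.cuda.is_available():
    # allocate on the GPU in the training dtype, as the commit now does
    softmax_offset = torch.zeros(
        num_heads, device=torch.cuda.current_device(), dtype=params_dtype
    )
else:
    # CPU fallback so the sketch runs anywhere
    softmax_offset = torch.zeros(num_heads, dtype=params_dtype)

print(softmax_offset.device, softmax_offset.dtype)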

megatron/core/transformer/utils.py

Lines changed: 0 additions & 3 deletions
@@ -1,7 +1,6 @@
 # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 
 """Utilities for transformer layers."""
-from functools import lru_cache
 from operator import itemgetter
 from typing import TYPE_CHECKING, Any, Dict, Iterable, Optional, Tuple, Union
 
@@ -29,13 +28,11 @@ def get_linear_layer(rows, columns, init_method, perform_initialization=True):
     return layer
 
 
-@lru_cache(maxsize=32)
 def get_default_causal_mask(sq: int) -> torch.Tensor:
     """Return the causal upper triangular mask for softmax input."""
     return torch.triu(torch.ones(sq, sq, device="cuda"), diagonal=1).bool()
 
 
-@lru_cache(maxsize=32)
 def get_sliding_window_causal_mask(sq, skv, window_size):
     """Create the equivalent attention mask for SWA in [sq, skv] shape"""
     m = torch.ones(sq, skv, dtype=torch.bool, device="cuda")
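
With @lru_cache removed, each call now builds a fresh mask. Caching a tensor-returning helper hands every caller the same tensor object, so an in-place edit by one caller leaks into all later ones, and the cached CUDA tensors stay resident for the life of the process; removing the cache is also why the test file below drops its get_default_causal_mask.cache_clear() teardown. A CPU-only sketch of the aliasing hazard (device="cuda" omitted so it runs anywhere):

from functools import lru_cache

import torch

@lru_cache(maxsize=32)
def cached_causal_mask(sq: int) -> torch.Tensor:
    # cached helper in the style of the removed decorator (CPU for portability)
    return torch.triu(torch.ones(sq, sq), diagonal=1).bool()

m1 = cached_causal_mask(4)
m1[0, 1] = False                # in-place edit on the shared, cached object
m2 = cached_causal_mask(4)
print(m2[0, 1])                 # tensor(False): the "fresh" mask is already corrupted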

tests/unit_tests/fusions/test_torch_softmax.py

Lines changed: 2 additions & 6 deletions
@@ -1,3 +1,5 @@
+# Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+
 import pytest
 import torch
 
@@ -21,9 +23,6 @@ def setup_method(self, method):
             scale=None,
         )
 
-    def teardown_method(self):
-        get_default_causal_mask.cache_clear()
-
     def test_output_shape(self):
         x = torch.randn(8, 2, 4, 4, device="cuda")
         y = self.softmax(x, None, None)
@@ -126,9 +125,6 @@ def test_causal_mask_equal_scores(self):
 class TestFusedScaleMaskSoftmaxComprehensive:
     """Comprehensive tests for FusedScaleMaskSoftmax including window attention and scaling."""
 
-    def teardown_method(self):
-        get_default_causal_mask.cache_clear()
-
     def test_scaling_factor(self):
         """Test softmax with different scaling factors."""
         x = torch.randn(2, 4, 8, 8, device="cuda")
