
Commit a2d8c80

yueshen2016 authored and Chen-Han Yu committed
ADLR/megatron-lm!4169 - [OMNIML-2921] GPT-OSS Modelopt support
Co-authored-by: Chen-Han Yu <[email protected]>
1 parent 4666de7 commit a2d8c80

6 files changed: +150 -17 lines changed
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=openai/gpt-oss-20b
+    TOKENIZER_MODEL=openai/gpt-oss-20b
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+# WAR: enable-gpt-oss is a temporary workaround for using the default GPT-OSS config
+MODEL_ARGS=" \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --bf16 \
+    --no-masked-softmax-fusion \
+    --untie-embeddings-and-output-weights \
+    --no-rope-fusion \
+    --normalization RMSNorm \
+    --num-layers 36 \
+    --hidden-size 2880 \
+    --ffn-hidden-size 2880 \
+    --num-attention-heads 64 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --kv-channels 64 \
+    --num-experts 128 \
+    --moe-ffn-hidden-size 2880 \
+    --moe-router-dtype fp32 \
+    --moe-router-topk 4 \
+    --moe-aux-loss-coeff 0.0 \
+    --moe-token-dispatcher-type alltoall \
+    --moe-router-score-function softmax \
+    --moe-router-load-balancing-type aux_loss \
+    --seq-length 4096 \
+    --max-position-embeddings 40960 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --make-vocab-size-divisible-by 128 \
+    --use-mcore-models \
+    --rotary-percent 1.0 \
+    --rotary-base 150000 \
+    --no-bias-gelu-fusion \
+    --sequence-parallel \
+    --export-force-local-attention \
+    --no-bias-dropout-fusion \
+    --padded-vocab-size 201088 \
+    --quick-geglu \
+    --glu-linear-offset 1.0 \
+    --softmax-type learnable \
+    --window-attn-skip-freq 2 \
+    --enable-gpt-oss \
+    --activation-func-clamp-value 7.0 \
+    --window-size 128,0 \
+"
Lines changed: 53 additions & 0 deletions
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+if [ -z ${HF_MODEL_CKPT} ]; then
+    HF_MODEL_CKPT=openai/gpt-oss-20b
+    TOKENIZER_MODEL=openai/gpt-oss-20b
+else
+    TOKENIZER_MODEL=${HF_MODEL_CKPT}
+fi
+
+# WAR: enable-gpt-oss is a temporary workaround for using the default GPT-OSS config
+MODEL_ARGS=" \
+    --save-interval 100000 \
+    --micro-batch-size 1 \
+    --bf16 \
+    --no-masked-softmax-fusion \
+    --untie-embeddings-and-output-weights \
+    --no-rope-fusion \
+    --normalization RMSNorm \
+    --num-layers 24 \
+    --hidden-size 2880 \
+    --ffn-hidden-size 2880 \
+    --num-attention-heads 64 \
+    --group-query-attention \
+    --num-query-groups 8 \
+    --kv-channels 64 \
+    --num-experts 32 \
+    --moe-ffn-hidden-size 2880 \
+    --moe-router-dtype fp32 \
+    --moe-router-topk 4 \
+    --moe-aux-loss-coeff 0.0 \
+    --moe-token-dispatcher-type alltoall \
+    --moe-router-score-function softmax \
+    --moe-router-load-balancing-type aux_loss \
+    --seq-length 4096 \
+    --max-position-embeddings 40960 \
+    --tokenizer-type HuggingFaceTokenizer \
+    --make-vocab-size-divisible-by 128 \
+    --use-mcore-models \
+    --rotary-percent 1.0 \
+    --rotary-base 150000 \
+    --no-bias-gelu-fusion \
+    --sequence-parallel \
+    --export-force-local-attention \
+    --no-bias-dropout-fusion \
+    --padded-vocab-size 201088 \
+    --quick-geglu \
+    --glu-linear-offset 1.0 \
+    --softmax-type learnable \
+    --window-attn-skip-freq 2 \
+    --enable-gpt-oss \
+    --activation-func-clamp-value 7.0 \
+    --window-size 128,0 \
+"

examples/post_training/modelopt/generate.py

Lines changed: 4 additions & 3 deletions
@@ -150,9 +150,10 @@ def get_conversations(example):
     input_ids = tokenizer.apply_chat_template(
         new_conversations, return_tensors="pt", add_generation_prompt=True
     )
-    output_ids = simple_generate(
-        unwrapped_model, input_ids.cuda(), osl=args.osl, disable_tqdm=args.disable_tqdm
-    )
+    with torch.no_grad():
+        output_ids = simple_generate(
+            unwrapped_model, input_ids.cuda(), osl=args.osl, disable_tqdm=args.disable_tqdm
+        )
     output_texts = tokenizer.batch_decode(output_ids)[0]
     print_rank_0("{}".format(output_texts))
     new_conversations.append({"role": "assistant", "content": output_texts})
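The only functional change here is wrapping generation in torch.no_grad(), so PyTorch stops recording an autograd graph across decoding steps; without it, activation memory grows with every generated token even though no backward pass is ever run. A minimal, generic sketch of the pattern (the model callable and greedy loop below are illustrative, not the repository's simple_generate):

import torch

@torch.no_grad()  # equivalent to the `with torch.no_grad():` block in the diff
def greedy_decode(model, input_ids, max_new_tokens=32):
    """Greedy decoding without building an autograd graph.

    `model` is any callable mapping token ids [B, T] to logits [B, T, V];
    this is a generic illustration, not the repo's simple_generate().
    """
    for _ in range(max_new_tokens):
        logits = model(input_ids)
        next_token = logits[:, -1, :].argmax(dim=-1, keepdim=True)
        input_ids = torch.cat([input_ids, next_token], dim=-1)
    return input_ids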

examples/post_training/modelopt/mmlu.py

Lines changed: 17 additions & 14 deletions
@@ -28,6 +28,7 @@ def add_mmlu_args(parser):
     group.add_argument("--disable-tqdm", action="store_true", help="Disable tqdm.")
     group.add_argument("--fraction", type=float, default=1.0, help="Fraction of dataset to use.")
     group.add_argument("--lower-bound", type=float, default=None)
+    group.add_argument("--no-subject-prompt", action="store_true", help="Use empty prompt instead of subject-based prompt.")
     add_modelopt_args(parser)
     return parser
 

@@ -101,17 +102,20 @@ def format_example(example, include_answer: bool = True):
     for choice, answer in zip(["A", "B", "C", "D"], example["choices"]):
         prompt += "\n{}. {}".format(choice, answer)
     if include_answer:
-        prompt += "Answer: {}\n\n".format(example["answer"])
+        prompt += "\nAnswer: {}\n\n".format(["A", "B", "C", "D"][example["answer"]])
     else:
         prompt += "\nAnswer:"
     return prompt
 
 
-def generate_prompt(test_example, dev_examples, few_shots=0):
+def generate_prompt(test_example, dev_examples, few_shots=0, no_subject_prompt=False):
     """Generating few-shot prompts."""
-    prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
-        " ".join(test_example["subject"].split("_"))
-    )
+    if no_subject_prompt:
+        prompt = ""
+    else:
+        prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
+            " ".join(test_example["subject"].split("_"))
+        )
     for i in range(few_shots):
         prompt += format_example(dev_examples[i])
     prompt += format_example(test_example, include_answer=False)

@@ -147,11 +151,6 @@ def generate_prompt(test_example, dev_examples, few_shots=0):
     model = get_model(functools.partial(model_provider, parallel_output=True), wrap_with_ddp=False)
     report_current_memory_info()
 
-    # Materialize the model from meta device to gpu before loading the checkpoint.
-    unwrapped_model = unwrap_model(model)[0]
-    unwrapped_model.to_empty(device="cuda")
-    report_current_memory_info()
-
     disable_tqdm = args.disable_tqdm or torch.distributed.get_rank() > 0
 
     tokenizer = get_tokenizer()._tokenizer

@@ -160,6 +159,9 @@ def generate_prompt(test_example, dev_examples, few_shots=0):
     load_modelopt_checkpoint(model, strict=not args.untie_embeddings_and_output_weights)
     print_rank_0("Done loading checkpoint")
 
+    unwrapped_model = unwrap_model(model)[0]
+    unwrapped_model.eval()
+
     all_subjects = get_all_subjects()
 
     all_correct = {}

@@ -172,12 +174,13 @@ def generate_prompt(test_example, dev_examples, few_shots=0):
     for idx, test_example in enumerate(test_data):
         if idx > args.fraction * len(test_data):
             break
-        prompt = generate_prompt(test_example, dev_data, few_shots=0)
+        prompt = generate_prompt(test_example, dev_data, few_shots=0, no_subject_prompt=args.no_subject_prompt)
         label = ["A", "B", "C", "D"][test_example["answer"]]
         tokens = tokenizer(prompt, return_tensors="pt")
-        generated_ids = simple_generate(
-            unwrapped_model, tokens.input_ids.cuda(), osl=2, disable_tqdm=disable_tqdm
-        )
+        with torch.no_grad():
+            generated_ids = simple_generate(
+                unwrapped_model, tokens.input_ids.cuda(), osl=2, disable_tqdm=disable_tqdm
+            )
         predict = tokenizer.batch_decode(generated_ids)[0].strip()
         correct += [True] if predict.startswith(label) else [False]
         all_correct[subject] = correct
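The format_example() change fixes two things at once: the few-shot answer line was missing its leading newline, so "Answer:" was glued onto option D, and it printed the integer label instead of the letter. A standalone sketch of the corrected formatting on a made-up record (the question prefix is an assumption, since the hunk starts mid-function):

CHOICES = ["A", "B", "C", "D"]

def format_example(example, include_answer=True):
    """Mirror of the corrected MMLU prompt formatting shown in the diff above."""
    prompt = example["question"]   # assumption: the prompt begins with the question text
    for choice, answer in zip(CHOICES, example["choices"]):
        prompt += "\n{}. {}".format(choice, answer)
    if include_answer:
        # Map the integer label (e.g. 2) to its letter ("C") and keep the newline.
        prompt += "\nAnswer: {}\n\n".format(CHOICES[example["answer"]])
    else:
        prompt += "\nAnswer:"
    return prompt

sample = {
    "question": "Which planet is the largest in the solar system?",
    "choices": ["Mars", "Venus", "Jupiter", "Mercury"],
    "answer": 2,
}
print(format_example(sample))   # ends with "Answer: C" on its own line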

megatron/post_training/arguments.py

Lines changed: 8 additions & 0 deletions
@@ -122,5 +122,13 @@ def add_modelopt_args(parser):
         action="store_true",
         help='Will be set automatically when loading a ModelOpt checkpoint.',
     )
+
+    # GPT-OSS YaRN RoPE support
+    group.add_argument(
+        '--enable-gpt-oss',
+        action="store_true",
+        help='Enable GPT-OSS mode with YaRN RoPE configuration. When enabled, automatically '
+        'configures all YaRN parameters with GPT-OSS defaults.',
+    )
 
     return parser
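Because this is a plain store_true flag, it parses to args.enable_gpt_oss (dashes become underscores) and defaults to False when omitted; that attribute is what model_provider.py checks below. A minimal, standalone argparse sketch of the behaviour, not the actual Megatron parser:

import argparse

parser = argparse.ArgumentParser()
group = parser.add_argument_group("modelopt-generic")   # group name here is illustrative
group.add_argument(
    "--enable-gpt-oss",
    action="store_true",
    help="Enable GPT-OSS mode with YaRN RoPE configuration.",
)

args = parser.parse_args(["--enable-gpt-oss"])
print(args.enable_gpt_oss)   # True; False when the flag is not passed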

megatron/post_training/model_provider.py

Lines changed: 15 additions & 0 deletions
@@ -151,6 +151,21 @@ def model_provider(pre_process=True, post_process=True, parallel_output=True) ->
     # ModelOpt by default assumes none homogenous layers. This affect the storage format of the sharded checkpoint.
     config = core_transformer_config_from_args(args)
 
+    # Handle GPT-OSS mode with YaRN RoPE configuration
+    if hasattr(args, 'enable_gpt_oss') and args.enable_gpt_oss:
+        print_rank_0("GPT-OSS mode enabled: Configuring YaRN RoPE parameters")
+
+        # Set GPT-OSS YaRN values directly on the config
+        # These defaults are based on Huggingface GPT-OSS configurations
+        config.position_embedding_type = "yarn"
+        config.yarn_rotary_scaling_factor = 32.0
+        config.yarn_original_max_position_embeddings = 131072
+        config.yarn_beta_fast = 32.0
+        config.yarn_beta_slow = 1.0
+        config.yarn_mscale = 1.0
+        config.yarn_mscale_all_dim = 0.0
+        config.yarn_correction_range_round_to_int = False
+
     if args.use_legacy_models:
         raise ValueError(
             "ModelOpt integration only support MCore models. Use --use-mcore-modules instead."
