Skip to content

Commit 3e308ac

Browse files
authored
Merge branch 'main' into denliu/deepep_shared_expert_overlap
2 parents b120ef9 + 0634924 commit 3e308ac

File tree

156 files changed

+8882
-2951
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

156 files changed

+8882
-2951
lines changed

.github/CODEOWNERS

Lines changed: 19 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,52 +1,48 @@
11
megatron/core/ @NVIDIA/core-adlr @NVIDIA/core-nemo
22

3-
megatron/core/models/gpt/ @NVIDIA/gpt
3+
megatron/core/models/gpt/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/gpt
44

5-
megatron/core/models/multimodal/ @NVIDIA/multi-modal
5+
megatron/core/models/multimodal/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/multi-modal
66

7-
megatron/core/models/mamba/ @NVIDIA/hybrid-mamba
7+
megatron/core/models/mamba/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
8+
megatron/core/ssm/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/hybrid-mamba
89

9-
megatron/core/datasets/ @NVIDIA/datasets
10+
megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets
1011

11-
megatron/core/distributed/fsdp/ @NVIDIA/megatron-fsdp
12+
megatron/core/distributed/fsdp/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp
1213

13-
megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/megatron-fsdp
14+
megatron/core/transformer/fsdp_dtensor_checkpoint.py @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/megatron-fsdp
1415

15-
megatron/core/dist_checkpointing/ @NVIDIA/dist-checkpointing
16+
megatron/core/dist_checkpointing/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-checkpointing
1617

17-
megatron/core/optimizer/distrib_optimizer/ @NVIDIA/dist-optimizer
18+
megatron/core/optimizer/distrib_optimizer/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/dist-optimizer
1819

19-
megatron/core/inference/modelopt_support @NVIDIA/quantization-and-inference
20+
megatron/core/inference/modelopt_support @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/quantization-and-inference
2021

21-
megatron/core/datasets/ @NVIDIA/datasets
22+
megatron/core/datasets/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/datasets
2223

23-
megatron/core/pipeline_parallel/ @NVIDIA/pipeline-parallelism
24+
megatron/core/pipeline_parallel/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/pipeline-parallelism
2425

2526
megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo
2627

27-
megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-devtech
28+
megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/mixture-of-experts-adlr @NVIDIA/mixture-of-experts-devtech
2829

29-
megatron/core/inference/ @NVIDIA/inference
30+
megatron/core/inference/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/inference
3031

31-
megatron/core/parallel_state.py @NVIDIA/core-nemo
32+
megatron/core/parallel_state.py @NVIDIA/core-adlr @NVIDIA/core-nemo
3233

33-
megatron/core/post_training/ @NVIDIA/post-training
34+
megatron/core/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
3435

35-
megatron/post_training/ @NVIDIA/post-training
36+
megatron/post_training/ @NVIDIA/core-adlr @NVIDIA/core-nemo @NVIDIA/post-training
3637

3738
.gitlab/ @NVIDIA/ci
3839
.github/ @NVIDIA/ci
3940
.gitlab-ci.yml @NVIDIA/ci
4041
docker/ @NVIDIA/ci
41-
tests/unit_tests/run_ci_test.sh @NVIDIA/ci
42-
tests/test_utils/python_scripts/
4342
tests/functional_tests/python_test_utils/ @NVIDIA/ci
4443
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
45-
megatron/core/transformer/transformer_block.py @NVIDIA/ci
46-
megatron/core/transformer/transformer_layer.py @NVIDIA/ci
47-
tests/functional_tests/test_cases/ @NVIDIA/ci
48-
tests/functional_tests/recipes/ @NVIDIA/ci
49-
tests/unit_tests/ @NVIDIA/ci
44+
tests/test_utils/recipes/ @NVIDIA/ci
45+
tests/unit_tests/run_ci_test.sh @NVIDIA/ci
5046

5147
megatron/rl/ @NVIDIA/reinforcement-learning
5248
examples/rl/ @NVIDIA/reinforcement-learning

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
enabled: true
22
auto_sync_draft: false
33
auto_sync_ready: true
4-
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
4+
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "guyueh1", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yeyu-nvidia", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]

.github/workflows/_build_test_publish_wheel.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ jobs:
129129
publish-wheels:
130130
needs: [build-and-test-wheels]
131131
runs-on: ubuntu-latest
132+
if: github.ref == 'refs/heads/main'
132133
environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/heads/r')) && 'main' || 'public' }}
133134
strategy:
134135
fail-fast: false

.github/workflows/auto-update-copy-pr-bot.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,10 @@ jobs:
4848
mv .github/copy-pr-bot.yaml.new .github/copy-pr-bot.yaml
4949
5050
- name: Commit changes
51+
env:
52+
GH_TOKEN: ${{ secrets.PAT }}
5153
run: |
52-
git remote set-url origin https://x-access-token:${{ secrets.PAT }}@github.com/NVIDIA/Megatron-LM.git
54+
git remote set-url origin https://x-access-token:${GH_TOKEN}@github.com/NVIDIA/Megatron-LM.git
5355
git config --global user.name "GitHub Actions"
5456
git config --global user.email "github-actions[bot]@users.noreply.github.com"
5557
git add .github/copy-pr-bot.yaml
@@ -58,4 +60,4 @@ jobs:
5860
exit 0
5961
fi
6062
git commit -m "Update copy-pr-bot.yaml [skip ci]"
61-
git push
63+
git push -u origin main

.github/workflows/build-test-publish-wheel.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ on:
1818
push:
1919
branches:
2020
- main
21-
- "pull-request/[0-9]+"
22-
- "deploy-release/*"
21+
- 'pull-request/[0-9]+'
22+
- 'deploy-release/*'
2323
merge_group:
2424
types: [checks_requested]
2525

@@ -63,7 +63,7 @@ jobs:
6363
env:
6464
GH_TOKEN: ${{ github.token }}
6565
GITHUB_RUN_ID: ${{ github.run_id }}
66-
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
66+
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' || github.ref != 'refs/heads/main' }}
6767
run: |
6868
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
6969
Lines changed: 171 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,171 @@
1+
name: API Compatibility Check
2+
3+
on:
4+
push:
5+
branches:
6+
- 'pull-request/[0-9]+'  # branches named pull-request/<one or more digits>
7+
paths:
8+
- 'megatron/core/**/*.py'  # only Python files under megatron/core trigger a run
9+
- '!megatron/core/tests/**'  # ...except anything under megatron/core/tests
10+
- '!megatron/legacy/**'  # NOTE(review): megatron/legacy is not matched by the include above, so this exclusion looks redundant - confirm
11+
12+
# Allow manual trigger; the caller must supply the baseline to compare against
13+
workflow_dispatch:
14+
inputs:
15+
baseline:
16+
description: 'Baseline git reference (tag/branch/commit)'
17+
required: true
18+
19+
jobs:
20+
check-compatibility:
21+
name: Check API Backward Compatibility
22+
runs-on: ubuntu-latest
23+
24+
# ============================================================================
25+
# Configuration Parameters (modify here); read by the baseline step below
26+
# ============================================================================
27+
env:
28+
# Baseline used for push-triggered runs; manual (workflow_dispatch) runs pass their own
29+
# Can be: branch name (e.g., 'main'), commit hash, or tag
30+
# Will be resolved to commit hash during execution
31+
DEFAULT_BASELINE: '712dff880cdf88e51289ad71e47d92f46d25a2d3'
32+
# Tag glob for baseline auto-detection (e.g., 'core_r*', 'core_v*'); only referenced by the commented-out auto-detect snippet
33+
TAG_PATTERN: 'core_v*'
34+
# Regex applied to the matched tags (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only); same commented-out snippet
35+
TAG_REGEX_FILTER: '^core_v[0-9]+\.[0-9]+\.[0-9]+$'
36+
# ============================================================================
37+
38+
steps:
39+
- name: Checkout code
40+
uses: actions/checkout@v4
41+
with:
42+
fetch-depth: 0 # Need full history to access baseline ref
43+
44+
- name: Set up Python
45+
uses: actions/setup-python@v5
46+
with:
47+
python-version: '3.12'  # quoted so the version stays a string
48+
49+
- name: Install griffe
50+
run: |  # installs griffe unpinned, then probes two import paths for Object; a failed probe only logs (|| echo) and does not fail the step
51+
python -m pip install --upgrade pip
52+
python -m pip install griffe
53+
python -c "import griffe; print('Griffe installed successfully')"
54+
python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed"
55+
python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed"
56+
57+
- name: Determine baseline reference
58+
id: baseline
59+
run: |
60+
if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
61+
# Manually specified baseline (branch, tag, or commit hash); read from the event payload so user input is never expanded into this script
62+
BASELINE_REF="$(jq -r '.inputs.baseline' "$GITHUB_EVENT_PATH")"
63+
else
64+
# Use the configured default baseline (exported to the shell from the env block)
65+
BASELINE_REF="$DEFAULT_BASELINE"
66+
67+
# Uncomment below to auto-detect from tags instead:
68+
# BASELINE_REF=$(git tag -l "$TAG_PATTERN" | grep -E "$TAG_REGEX_FILTER" | sort -V | tail -1)
69+
# if [ -z "$BASELINE_REF" ]; then
70+
# echo "Warning: No tags matching pattern found. Using default: $DEFAULT_BASELINE" >&2
71+
# BASELINE_REF="$DEFAULT_BASELINE"
72+
# fi
73+
fi
74+
75+
# Resolve baseline to a commit hash: ^{commit} peels annotated tags, and origin/<ref> covers branches that only exist as remote-tracking refs
76+
BASELINE_HASH=$(git rev-parse --verify --quiet "${BASELINE_REF}^{commit}" || git rev-parse --verify "origin/${BASELINE_REF}^{commit}")
77+
78+
echo "baseline=$BASELINE_HASH" >> "$GITHUB_OUTPUT"
79+
echo "Using baseline: $BASELINE_REF (resolved to commit: $BASELINE_HASH)"
80+
81+
- name: Run compatibility check
82+
id: compat_check
83+
run: |
84+
# Mirror the checker's output to the log and to a file that the failure step replays
85+
python scripts/check_api_backwards_compatibility.py \
86+
--baseline "${{ steps.baseline.outputs.baseline }}" \
87+
--verbose 2>&1 | tee compat_check_output.txt
88+
89+
# Without pipefail the pipeline's status is tee's, so take the checker's own exit code from PIPESTATUS
90+
EXIT_CODE=${PIPESTATUS[0]}
91+
echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
92+
exit "$EXIT_CODE"
93+
continue-on-error: true  # lets the job continue so the follow-up steps can branch on this step's outcome
94+
95+
- name: Fail job if breaking changes detected
96+
if: steps.compat_check.outcome == 'failure'  # outcome is the checker step's result before continue-on-error is applied
97+
run: |  # prints remediation guidance, replays the saved checker output, then fails the job
98+
echo ""
99+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
100+
echo "🔍 WHAT IS THIS CHECK?"
101+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
102+
echo ""
103+
echo "This check ensures that changes to Megatron Core's public API do not"
104+
echo "break backward compatibility for users. It compares your PR against"
105+
echo "baseline commit ${{ steps.baseline.outputs.baseline }} to detect breaking changes in:"
106+
echo ""
107+
echo " • Function signatures (parameters, order, types)"
108+
echo " • Class structures and methods"
109+
echo " • Return types and public interfaces"
110+
echo ""
111+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
112+
echo "🛠️ HOW TO FIX THIS"
113+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
114+
echo ""
115+
echo "Choose ONE of these resolution strategies:"
116+
echo ""
117+
echo "1️⃣ REVERT THE BREAKING CHANGE (Recommended)"
118+
echo " → Modify your code to preserve backward compatibility"
119+
echo " → Add new parameters as optional (with defaults)"
120+
echo " → Keep existing parameters in the same order"
121+
echo ""
122+
echo "2️⃣ MARK AS INTERNAL API (If this is internal code)"
123+
echo " → Add @internal_api decorator from megatron.core.utils"
124+
echo ""
125+
echo " Example (for classes):"
126+
echo " from megatron.core.utils import internal_api"
127+
echo ""
128+
echo " @internal_api"
129+
echo " class ExperimentalFeature:"
130+
echo " pass"
131+
echo ""
132+
echo " Example (for functions):"
133+
echo " from megatron.core.utils import internal_api"
134+
echo ""
135+
echo " @internal_api"
136+
echo " def internal_helper_function():"
137+
echo " pass"
138+
echo ""
139+
echo "3️⃣ USE DEPRECATION (For gradual API changes)"
140+
echo " → Add @deprecated decorator for transition period"
141+
echo " → Example:"
142+
echo " from megatron.core.utils import deprecated"
143+
echo ""
144+
echo " @deprecated(version='1.0', removal_version='2.0',"
145+
echo " alternative='new_function')"
146+
echo " def old_function():"
147+
echo " pass"
148+
echo ""
149+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
150+
echo "📋 BREAKING CHANGES DETECTED"
151+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
152+
echo ""
153+
cat compat_check_output.txt
154+
echo ""
155+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
156+
echo "📚 MORE INFORMATION"
157+
echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
158+
echo ""
159+
echo "📖 Full documentation: docs/api-backwards-compatibility-check.md"
160+
echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py"
161+
echo "❓ Questions? Check the docs or ask in #megatron-core"
162+
echo ""
163+
164+
echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy."
165+
exit 1
166+
167+
- name: Success message
168+
if: steps.compat_check.outcome == 'success'  # runs only when the checker step itself exited 0
169+
run: |  # emits a workflow notice annotation
170+
echo "::notice::✅ No breaking API changes detected!"
171+

.github/workflows/install-test.yml

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,8 +22,8 @@ on:
2222
branches:
2323
- dev
2424
- main
25-
- "pull-request/[0-9]+"
26-
- "deploy-release/*"
25+
- 'pull-request/[0-9]+'
26+
- 'deploy-release/*'
2727
merge_group:
2828
types: [checks_requested]
2929

@@ -47,7 +47,7 @@ jobs:
4747
strategy:
4848
fail-fast: false
4949
matrix:
50-
python-version: ["3.12"]
50+
python-version: ['3.12']
5151
steps:
5252
- name: Checkout repository
5353
uses: actions/checkout@v4
@@ -93,7 +93,7 @@ jobs:
9393
strategy:
9494
fail-fast: false
9595
matrix:
96-
python-version: ["3.12"]
96+
python-version: ['3.12']
9797
steps:
9898
- name: Checkout repository
9999
uses: actions/checkout@v4
@@ -140,13 +140,16 @@ jobs:
140140
&& !cancelled()
141141
&& github.repository == 'NVIDIA/Megatron-LM'
142142
steps:
143+
- name: Checkout
144+
uses: actions/checkout@v4
145+
143146
- name: Get workflow result
144147
id: result
145148
shell: bash -x -e -u -o pipefail {0}
146149
env:
147150
GH_TOKEN: ${{ github.token }}
148151
RUN_ID: ${{ github.run_id }}
149-
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
152+
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
150153
run: |
151154
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
152155

LICENSE

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,6 @@
11
The following applies to all files unless otherwise noted:
22

3-
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4-
#
3+
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
54
# Redistribution and use in source and binary forms, with or without
65
# modification, are permitted provided that the following conditions
76
# are met:

0 commit comments

Comments
 (0)