Skip to content

Commit ecf173d

Browse files
committed
Merge branch 'main' into robinz/refactor_cuda_graph_scope
2 parents a35489f + 6c8cdd5 commit ecf173d

File tree

98 files changed

+5683
-1146
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

98 files changed

+5683
-1146
lines changed
Lines changed: 155 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,155 @@
1+
# Compares the public API of megatron/core against a baseline git reference
# using scripts/check_api_backwards_compatibility.py and fails the job when
# the checker exits non-zero.
name: API Compatibility Check

on:
  push:
    branches:
      - 'pull-request/[0-9]+'
    paths:
      - 'megatron/core/**/*.py'
      - '!megatron/core/tests/**'
      - '!megatron/legacy/**'

  # Allow manual trigger
  workflow_dispatch:
    inputs:
      baseline:
        description: 'Baseline git reference (tag/branch)'
        required: true

# The job only reads repository contents; do not grant the token more.
permissions:
  contents: read

jobs:
  check-compatibility:
    name: Check API Backward Compatibility
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4
        with:
          fetch-depth: 0  # Need full history to access baseline ref

      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install griffe
        run: |
          python -m pip install --upgrade pip
          python -m pip install griffe
          python -c "import griffe; print('Griffe installed successfully')"
          # Diagnostic probes only: report which import path exposes Object
          # without failing the step.
          python -c "from griffe import Object; print('Object import successful')" || echo "Object import from griffe failed"
          python -c "from griffe.dataclasses import Object; print('Object import from dataclasses successful')" || echo "Object import from dataclasses failed"

      - name: Determine baseline reference
        id: baseline
        env:
          # Pass the user-supplied value through the environment instead of
          # expanding it inside the script, so it is never parsed as shell.
          INPUT_BASELINE: ${{ github.event.inputs.baseline }}
        run: |
          if [ "$GITHUB_EVENT_NAME" = "workflow_dispatch" ]; then
            # Use manually specified baseline
            BASELINE="$INPUT_BASELINE"

            if [ -z "$BASELINE" ]; then
              echo "Error: No baseline reference was provided." >&2
              exit 1
            fi
          else
            # Auto-detect latest core release tag (stable versions only, no rc)
            BASELINE=$(git tag -l 'core_v*' | grep -E '^core_v[0-9]+\.[0-9]+\.[0-9]+$' | sort -V | tail -1)

            # Error out if no tags found
            if [ -z "$BASELINE" ]; then
              echo "Error: No core_v* release tags found. Cannot determine baseline." >&2
              exit 1
            fi
          fi

          echo "baseline=$BASELINE" >> "$GITHUB_OUTPUT"
          echo "Using baseline: $BASELINE"

      - name: Run compatibility check
        id: compat_check
        env:
          BASELINE: ${{ steps.baseline.outputs.baseline }}
        run: |
          # Save output to file for later display
          python scripts/check_api_backwards_compatibility.py \
            --baseline "$BASELINE" \
            --verbose 2>&1 | tee compat_check_output.txt

          # Capture the checker's exit code rather than tee's. This relies on
          # the default step shell (bash -e, no pipefail): with pipefail the
          # step would abort on the pipeline before reaching this line.
          EXIT_CODE=${PIPESTATUS[0]}
          echo "exit_code=$EXIT_CODE" >> "$GITHUB_OUTPUT"
          exit $EXIT_CODE
        # Keep the job running so the next step can print guidance; that step
        # keys off `outcome`, which reflects the result before this override.
        continue-on-error: true

      - name: Fail job if breaking changes detected
        if: steps.compat_check.outcome == 'failure'
        run: |
          echo "::group::❌ API Backward Compatibility Check Failed"
          echo ""
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "🔍 WHAT IS THIS CHECK?"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo ""
          echo "This check ensures that changes to Megatron Core's public API do not"
          echo "break backward compatibility for users. It compares your PR against"
          echo "the latest stable release to detect breaking changes in:"
          echo ""
          echo " • Function signatures (parameters, order, types)"
          echo " • Class structures and methods"
          echo " • Return types and public interfaces"
          echo ""
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "🛠️ HOW TO FIX THIS"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo ""
          echo "Choose ONE of these resolution strategies:"
          echo ""
          echo "1️⃣ REVERT THE BREAKING CHANGE (Recommended)"
          echo " → Modify your code to preserve backward compatibility"
          echo " → Add new parameters as optional (with defaults)"
          echo " → Keep existing parameters in the same order"
          echo ""
          echo "2️⃣ MARK AS INTERNAL API (If this is internal code)"
          echo " → Add @internal_api decorator from megatron.core.utils"
          echo ""
          echo " Example (for classes):"
          echo " from megatron.core.utils import internal_api"
          echo ""
          echo " @internal_api"
          echo " class ExperimentalFeature:"
          echo " pass"
          echo ""
          echo " Example (for functions):"
          echo " from megatron.core.utils import internal_api"
          echo ""
          echo " @internal_api"
          echo " def internal_helper_function():"
          echo " pass"
          echo ""
          echo "3️⃣ USE DEPRECATION (For gradual API changes)"
          echo " → Add @deprecated decorator for transition period"
          echo " → Example:"
          echo " from megatron.core.utils import deprecated"
          echo ""
          echo " @deprecated(version='1.0', removal_version='2.0',"
          echo " alternative='new_function')"
          echo " def old_function():"
          echo " pass"
          echo ""
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "📋 BREAKING CHANGES DETECTED"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo ""
          cat compat_check_output.txt
          echo ""
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo "📚 MORE INFORMATION"
          echo "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"
          echo ""
          echo "📖 Full documentation: docs/api-backwards-compatibility-check.md"
          echo "🔧 Checker script: scripts/check_api_backwards_compatibility.py"
          echo "❓ Questions? Check the docs or ask in #megatron-core"
          echo ""
          echo "::endgroup::"

          echo "::error::Breaking API changes detected. Please review the output above and choose a resolution strategy."
          exit 1

      - name: Success message
        if: steps.compat_check.outcome == 'success'
        run: |
          echo "::notice::✅ No breaking API changes detected!"
155+

.github/workflows/install-test.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -149,7 +149,7 @@ jobs:
149149
env:
150150
GH_TOKEN: ${{ github.token }}
151151
RUN_ID: ${{ github.run_id }}
152-
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
152+
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
153153
run: |
154154
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
155155

README.md

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ pip install --no-build-isolation .[mlm,dev]
9999

100100
```
101101
Megatron-LM/
102-
├── megatron/
102+
├── megatron/
103103
│ ├── core/ # Megatron Core (kernels, parallelism, building blocks)
104104
│ │ ├── models/ # Transformer models
105105
│ │ ├── transformer/ # Transformer building blocks
@@ -128,7 +128,7 @@ Megatron-LM/
128128

129129
- **Training state-of-the-art foundation models** at scale with cutting-edge performance on latest NVIDIA hardware
130130
- **Research teams** exploring new architectures and training techniques
131-
- **Learning distributed training** concepts and best practices
131+
- **Learning distributed training** concepts and best practices
132132
- **Quick experimentation** with proven model configurations
133133

134134
**What you get:**
@@ -137,7 +137,7 @@ Megatron-LM/
137137
- End-to-end examples from data prep to evaluation
138138
- Research-focused tools and utilities
139139

140-
### Megatron Core: Composable Library
140+
### Megatron Core: Composable Library
141141

142142
**Composable library** with GPU-optimized building blocks for custom training frameworks.
143143

@@ -170,7 +170,7 @@ Megatron-LM/
170170
- **[Megatron Bridge](https://github.com/NVIDIA-NeMo/Megatron-Bridge)** - Training library with bidirectional Hugging Face ↔ Megatron checkpoint conversion, flexible training loops, and production-ready recipes
171171
- **[NeMo RL](https://github.com/NVIDIA-NeMo/RL)** - Scalable toolkit for efficient reinforcement learning with RLHF, DPO, and other post-training methods
172172
- **[NeMo Framework](https://docs.nvidia.com/nemo-framework/user-guide/latest/overview.html)** - Enterprise framework with cloud-native support and end-to-end examples
173-
- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, and distillation
173+
- **[TensorRT Model Optimizer (ModelOpt)](https://github.com/NVIDIA/TensorRT-Model-Optimizer)** - Model optimization toolkit for quantization, pruning, distillation, speculative decoding, and more. Check out end-to-end examples in [examples/post_training/modelopt](./examples/post_training/modelopt/).
174174

175175
**Compatible with:** [Hugging Face Accelerate](https://github.com/huggingface/accelerate), [Colossal-AI](https://github.com/hpcaitech/ColossalAI), [DeepSpeed](https://github.com/microsoft/DeepSpeed)
176176

@@ -257,7 +257,7 @@ Our codebase efficiently trains models from 2B to 462B parameters across thousan
257257
**Benchmark Configuration:**
258258

259259
- **Vocabulary size**: 131,072 tokens
260-
- **Sequence length**: 4096 tokens
260+
- **Sequence length**: 4096 tokens
261261
- **Model scaling**: Varied hidden size, attention heads, and layers to achieve target parameter counts
262262
- **Communication optimizations**: Fine-grained overlapping with DP (`--overlap-grad-reduce`, `--overlap-param-gather`), TP (`--tp-comm-overlap`), and PP (enabled by default)
263263

docker/Dockerfile.ci.dev

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,9 @@ RUN bash -ex <<"EOF"
5959
ln -s libnvshmem_host.so.3 libnvshmem_host.so
6060
popd
6161

62-
git clone https://github.com/deepseek-ai/DeepEP.git
62+
git clone --branch tongliu/inter_node https://github.com/Autumn1998/DeepEP.git
6363
pushd DeepEP
64-
git checkout ef73fd9a4c098e09929e48dfd696505ddc8e2043
64+
git checkout 0fa8568c5923fcfc87f49ef0c3761dc013375a67
6565
patch -p1 < /workspace/deepep.patch
6666
popd
6767
TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" uv pip install --no-build-isolation -v DeepEP/.

0 commit comments

Comments
 (0)