Skip to content

Commit 4f52e0c

Browse files
authored
Merge branch 'dev' into fix_mfsdp_random_hang
2 parents 138ee3c + a4fce1d commit 4f52e0c

File tree

191 files changed

+59997
-2680
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

191 files changed

+59997
-2680
lines changed

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
11
enabled: true
22
auto_sync_draft: false
33
auto_sync_ready: true
4-
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "hxbai", "jaredcasper", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mkhona-nvidia", "pablo-garay", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
4+
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]

.github/workflows/auto-update-copy-pr-bot.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ jobs:
99
auto-update-copy-pr-bot:
1010
runs-on: ubuntu-latest
1111
environment: nemo-ci
12+
if: github.repository == 'NVIDIA/Megatron-LM'
1213
steps:
1314
- name: Checkout code
1415
uses: actions/checkout@v3

.github/workflows/cicd-main.yml

Lines changed: 33 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -233,11 +233,26 @@ jobs:
233233
with:
234234
python-version: 3.12
235235

# Step: install the GitHub CLI (`gh`) package with apt-get.
# The shell line enables command tracing (-x), exit-on-error (-e),
# unset-variable errors (-u) and pipefail for this step's script.
236+
- name: Install GH CLI
237+
shell: bash -x -e -u -o pipefail {0}
238+
run: |
239+
apt-get update
240+
apt-get install -y gh
241+
236242
- name: Get PR info
237243
id: get-pr-info
238244
if: startsWith(github.ref, 'refs/heads/pull-request/')
239245
uses: nv-gha-runners/get-pr-info@main
240246

# Step: determine whether the pull request carries the "container::lts" label
# and publish the answer ("true"/"false") as the step output `main`.
247+
- name: Has lts label
248+
id: has-lts-label
249+
env:
250+
GH_TOKEN: ${{ secrets.PAT }}
251+
run: |
252+
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
# The fallback lives inside $(...) so that "false" is captured into the
# variable rather than only printed to the log. Using gh's own --jq keeps the
# lookup a single command, so a failing `gh pr view` triggers the fallback.
253+
HAS_LTS_LABEL=$(gh pr view $PR_NUMBER --json labels --jq '[.labels[].name] | any(. == "container::lts")' || echo "false")
254+
echo "main=$HAS_LTS_LABEL" | tee -a $GITHUB_OUTPUT
255+
241256
- name: Download test data
242257
shell: bash
243258
env:
@@ -276,6 +291,22 @@ jobs:
276291
echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
277292
echo "EOF" | tee -a $GITHUB_OUTPUT
278293
# Step: choose the base image for the container build.
# Reads HAS_LTS_LABEL (output `main` of the has-lts-label step) and writes two
# step outputs: `version` (contents of the selected docker/.ngc_version.* file)
# and `image_type` ("lts" or "dev").
294+
- name: Parse baseimage
295+
shell: bash
296+
id: base-image
297+
env:
298+
HAS_LTS_LABEL: ${{ steps.has-lts-label.outputs.main }}
299+
run: |
# Only the exact string "true" selects the LTS file; any other value,
# including an empty one, falls through to the dev file.
300+
if [ "$HAS_LTS_LABEL" == "true" ]; then
301+
NGC_VERSION=$(cat docker/.ngc_version.lts)
302+
echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
303+
echo "image_type=lts" | tee -a $GITHUB_OUTPUT
304+
else
305+
NGC_VERSION=$(cat docker/.ngc_version.dev)
306+
echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
307+
echo "image_type=dev" | tee -a $GITHUB_OUTPUT
308+
fi
309+
279310
- name: Set up Docker Buildx
280311
uses: docker/setup-buildx-action@v3
281312

@@ -287,7 +318,8 @@ jobs:
287318
context: .
288319
target: main
289320
build-args: |
290-
FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3
321+
FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
322+
IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
291323
cache-from: |
292324
type=registry,ref=${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
293325
type=registry,ref=${{ env.container-registry }}/megatron-lm:main-buildcache,mode=max

.github/workflows/copyright-check.yml

Lines changed: 3 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -17,10 +17,8 @@ name: Copyright check
1717
on:
1818
push:
1919
branches:
20-
- dev
21-
- main
22-
- "pull-request/[0-9]+"
23-
- "deploy-release/*"
20+
- 'pull-request/[0-9]+'
21+
- 'deploy-release/*'
2422
merge_group:
2523
types: [checks_requested]
2624

@@ -36,7 +34,7 @@ jobs:
3634
|| needs.pre-flight.outputs.is_merge_group == 'true'
3735
|| needs.pre-flight.outputs.is_deployment_workflow == 'true')
3836
&& github.repository == 'NVIDIA/Megatron-LM'
39-
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected].13
37+
uses: NVIDIA-NeMo/FW-CI-templates/.github/workflows/[email protected].12
4038

4139
copyright-check-summary:
4240
needs: [pre-flight, copyright-check]
@@ -50,13 +48,8 @@ jobs:
5048
&& github.repository == 'NVIDIA/Megatron-LM'
5149
runs-on: ubuntu-latest
5250
steps:
53-
- name: Checkout repository
54-
uses: actions/checkout@v4
55-
5651
- name: Result
5752
env:
58-
GH_TOKEN: ${{ github.token }}
59-
GITHUB_RUN_ID: ${{ github.run_id }}
6053
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' || needs.pre-flight.outputs.is_ci_workload == 'true' }}
6154
run: |
6255
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0

.github/workflows/install-test.yml

Lines changed: 8 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -22,8 +22,8 @@ on:
2222
branches:
2323
- dev
2424
- main
25-
- "pull-request/[0-9]+"
26-
- "deploy-release/*"
25+
- 'pull-request/[0-9]+'
26+
- 'deploy-release/*'
2727
merge_group:
2828
types: [checks_requested]
2929

@@ -47,7 +47,7 @@ jobs:
4747
strategy:
4848
fail-fast: false
4949
matrix:
50-
python-version: ["3.12"]
50+
python-version: ['3.12']
5151
steps:
5252
- name: Checkout repository
5353
uses: actions/checkout@v4
@@ -93,7 +93,7 @@ jobs:
9393
strategy:
9494
fail-fast: false
9595
matrix:
96-
python-version: ["3.12"]
96+
python-version: ['3.12']
9797
steps:
9898
- name: Checkout repository
9999
uses: actions/checkout@v4
@@ -140,13 +140,16 @@ jobs:
140140
&& !cancelled()
141141
&& github.repository == 'NVIDIA/Megatron-LM'
142142
steps:
143+
- name: Checkout
144+
uses: actions/checkout@v4
145+
143146
- name: Get workflow result
144147
id: result
145148
shell: bash -x -e -u -o pipefail {0}
146149
env:
147150
GH_TOKEN: ${{ github.token }}
148151
RUN_ID: ${{ github.run_id }}
149-
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' }}
152+
SKIPPING_IS_ALLOWED: ${{ needs.pre-flight.outputs.docs_only == 'true' || needs.pre-flight.outputs.is_deployment_workflow == 'true' || needs.pre-flight.outputs.is_merge_group == 'true' }}
150153
run: |
151154
FAILED_JOBS=$(gh run view $GITHUB_RUN_ID --json jobs --jq '[.jobs[] | select(.status == "completed" and .conclusion != "success")] | length') || echo 0
152155

.github/workflows/multi-approval-bot.yml

Lines changed: 0 additions & 75 deletions
This file was deleted.

.gitlab/stages/01.build.yml

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -53,10 +53,12 @@ test:build_image:
5353
parallel:
5454
matrix:
5555
- IMAGE: CI_MCORE_LTS_IMAGE
56-
FILE: Dockerfile.ci.lts
57-
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
56+
FILE: Dockerfile.ci.dev
57+
IMAGE_TYPE: lts
58+
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
5859
- IMAGE: CI_MCORE_DEV_IMAGE
5960
FILE: Dockerfile.ci.dev
61+
IMAGE_TYPE: dev
6062
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
6163
- IMAGE: UTILITY_IMAGE
6264
FILE: Dockerfile.linting

.gitlab/stages/04.functional-tests.yml

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -172,15 +172,15 @@ functional:run_dev_dgx_h100:
172172
functional:run_nemo:
173173
extends: [.functional_tests_rules]
174174
trigger:
175-
project: "dl/joc/nemo-ci"
175+
project: 'dl/joc/nemo-ci'
176176
branch: main-mirror
177177
strategy: depend
178178
inherit:
179179
variables: true
180180
variables:
181181
MCORE_MR_COMMIT: $CI_COMMIT_SHA
182-
TEST_NEMO2_MODULE: "True"
183-
ALLOW_FAILURE_DEPENDENCY: "True"
182+
TEST_NEMO2_MODULE: 'True'
183+
ALLOW_FAILURE_DEPENDENCY: 'True'
184184
TESTS_TO_RUN_ON_THIS_COMMIT: nightly
185185
rules:
186186
- if: $FUNCTIONAL_TEST == "yes"
@@ -217,7 +217,7 @@ functional:x_notify:
217217
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
218218
- export GITLAB_ENDPOINT
219219
- export CONTEXT=$FUNCTIONAL_TEST_SCOPE
220-
- export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || "0")
# TAG_TEAM is "1" when the pipeline runs on the main or dev branch and "0"
# otherwise. Both arms must `echo` so the command substitution captures a
# value; a bare "0" would be executed as a command and leave TAG_TEAM empty.
220+
- export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]] && echo "1" || echo "0")
221221
- export TEAM_SLUG=$SLACK_ADMIN
222222
- |
223223
python tests/test_utils/python_scripts/notify.py \
@@ -231,7 +231,7 @@ functional:x_notify:
231231
paths:
232232
- scripts
233233
rules:
234-
- if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main") && $FUNCTIONAL_TEST == "yes"
234+
- if: ($CI_PIPELINE_SOURCE == "schedule" || $CI_COMMIT_BRANCH == "main" || $CI_COMMIT_BRANCH == "dev") && $FUNCTIONAL_TEST == "yes"
235235
when: always
236236
- when: never
237237

CODEOWNERS

Lines changed: 0 additions & 26 deletions
This file was deleted.

LICENSE

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,6 @@
11
The following applies to all files unless otherwise noted:
22

3-
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
4-
#
3+
# Copyright (c) 2019-2025, NVIDIA CORPORATION. All rights reserved.
54
# Redistribution and use in source and binary forms, with or without
65
# modification, are permitted provided that the following conditions
76
# are met:

0 commit comments

Comments
 (0)