Skip to content

Commit 4b78163

Browse files
ko3n1gtdenedimapihtarAAnooshehsanthnm2
authored
Replay: [20251111] Ko3n1g/chore/main to dev (NVIDIA#2267)
Signed-off-by: dimapihtar <[email protected]> Signed-off-by: oliver könig <[email protected]> Signed-off-by: Asha Anoosheh <[email protected]> Signed-off-by: Keshav Santhanam <[email protected]> Signed-off-by: Evgeny <[email protected]> Signed-off-by: root <Evgeny> Co-authored-by: Teodor-Dumitru Ene <[email protected]> Co-authored-by: Dmytro Pykhtar <[email protected]> Co-authored-by: Asha Anoosheh <[email protected]> Co-authored-by: Keshav Santhanam <[email protected]> Co-authored-by: Teodor-Dumitru Ene <[email protected]> Co-authored-by: Robert Kirby <[email protected]> Co-authored-by: Lawrence McAfee <[email protected]> Co-authored-by: Robert Kirby <[email protected]> Co-authored-by: Mcore Bot <[email protected]> Co-authored-by: helen ngo <[email protected]> Co-authored-by: Evgeny Tsykunov <[email protected]>
2 parents 0bf9ff9 + 565202f commit 4b78163

File tree

141 files changed

+35891
-2289
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

141 files changed

+35891
-2289
lines changed

.github/copy-pr-bot.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
enabled: true
22
auto_sync_draft: false
33
auto_sync_ready: true
4-
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "hxbai", "jaredcasper", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mkhona-nvidia", "pablo-garay", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]
4+
trustees_override: ["AAnoosheh", "ArEsKay3", "Autumn1998", "BestJuly", "BoxiangW", "ChenhanYu", "FDecaYed", "HaochenYuan", "ISEEKYAN", "JRD971000", "QiZhangNV", "ShriyaRishab", "Victarry", "Wohox", "ZhiyuLi-Nvidia", "aklife97", "ananthsub", "asolergi-nv", "buptzyb", "chtruong814", "cspades", "cuichenx", "deepakn94", "dimapihtar", "duncanriach", "erhoo82", "ericharper", "fanshiqing", "gautham-kollu", "hxbai", "jaredcasper", "jiemingz", "jkamalu", "jon-barker", "kanz-nv", "kevalmorabia97", "ko3n1g", "kunlunl", "kvareddy", "layalir", "lhb8125", "lmcafee-nvidia", "maanug-nv", "mathemakitten", "matthieule", "mehraakash", "mkhona-nvidia", "pablo-garay", "parthmannan", "pthombre", "rogerwaleffe", "sanandaraj5597", "santhnm2", "sbak5", "shanmugamr1992", "shifangx", "shjwudp", "sidsingh-nvidia", "skyw", "tdene", "theothermike", "thomasdhc", "trintamaki", "tylerpoon", "wdykas", "xiaoyao0115", "xuwchen", "yanring", "yaox12", "yaoyu-33", "yashaswikarnati", "yobibyte", "youngeunkwon0405", "yuzhongw-nvidia", "zhongbozhu"]

.github/workflows/auto-update-copy-pr-bot.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ jobs:
99
auto-update-copy-pr-bot:
1010
runs-on: ubuntu-latest
1111
environment: nemo-ci
12+
if: github.repository == 'NVIDIA/Megatron-LM'
1213
steps:
1314
- name: Checkout code
1415
uses: actions/checkout@v3

.github/workflows/cicd-main.yml

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -233,11 +233,26 @@ jobs:
233233
with:
234234
python-version: 3.12
235235

236+
- name: Install GH CLI
237+
shell: bash -x -e -u -o pipefail {0}
238+
run: |
239+
apt-get update
240+
apt-get install -y gh
241+
236242
- name: Get PR info
237243
id: get-pr-info
238244
if: startsWith(github.ref, 'refs/heads/pull-request/')
239245
uses: nv-gha-runners/get-pr-info@main
240246

247+
- name: Has lts label
248+
id: has-lts-label
249+
env:
250+
GH_TOKEN: ${{ secrets.PAT }}
251+
run: |
252+
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
253+
# Resolve to the literal string "false" whenever the label lookup fails or prints nothing
# (e.g. PR_NUMBER is empty on non-PR refs), so the step output is always "true" or "false".
HAS_LTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "container::lts")') || HAS_LTS_LABEL="false"
HAS_LTS_LABEL=${HAS_LTS_LABEL:-false}
254+
echo "main=$HAS_LTS_LABEL" | tee -a $GITHUB_OUTPUT
255+
241256
- name: Download test data
242257
shell: bash
243258
env:
@@ -276,6 +291,22 @@ jobs:
276291
echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
277292
echo "EOF" | tee -a $GITHUB_OUTPUT
278293
294+
- name: Parse baseimage
295+
shell: bash
296+
id: base-image
297+
env:
298+
HAS_LTS_LABEL: ${{ steps.has-lts-label.outputs.main }}
299+
run: |
300+
if [ "$HAS_LTS_LABEL" == "true" ]; then
301+
NGC_VERSION=$(cat docker/.ngc_version.lts)
302+
echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
303+
echo "image_type=lts" | tee -a $GITHUB_OUTPUT
304+
else
305+
NGC_VERSION=$(cat docker/.ngc_version.dev)
306+
echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
307+
echo "image_type=dev" | tee -a $GITHUB_OUTPUT
308+
fi
309+
279310
- name: Set up Docker Buildx
280311
uses: docker/setup-buildx-action@v3
281312

@@ -287,7 +318,8 @@ jobs:
287318
context: .
288319
target: main
289320
build-args: |
290-
FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3
321+
FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
322+
IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
291323
cache-from: |
292324
type=registry,ref=${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
293325
type=registry,ref=${{ env.container-registry }}/megatron-lm:main-buildcache,mode=max

.github/workflows/multi-approval-bot.yml

Lines changed: 0 additions & 75 deletions
This file was deleted.

.gitlab/stages/01.build.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ test:build_image:
5353
parallel:
5454
matrix:
5555
- IMAGE: CI_MCORE_LTS_IMAGE
56-
FILE: Dockerfile.ci.lts
57-
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
56+
FILE: Dockerfile.ci.dev
57+
IMAGE_TYPE: lts
58+
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
5859
- IMAGE: CI_MCORE_DEV_IMAGE
5960
FILE: Dockerfile.ci.dev
61+
IMAGE_TYPE: dev
6062
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
6163
- IMAGE: UTILITY_IMAGE
6264
FILE: Dockerfile.linting

.gitlab/stages/04.functional-tests.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -172,15 +172,15 @@ functional:run_dev_dgx_h100:
172172
functional:run_nemo:
173173
extends: [.functional_tests_rules]
174174
trigger:
175-
project: "dl/joc/nemo-ci"
175+
project: 'dl/joc/nemo-ci'
176176
branch: main-mirror
177177
strategy: depend
178178
inherit:
179179
variables: true
180180
variables:
181181
MCORE_MR_COMMIT: $CI_COMMIT_SHA
182-
TEST_NEMO2_MODULE: "True"
183-
ALLOW_FAILURE_DEPENDENCY: "True"
182+
TEST_NEMO2_MODULE: 'True'
183+
ALLOW_FAILURE_DEPENDENCY: 'True'
184184
TESTS_TO_RUN_ON_THIS_COMMIT: nightly
185185
rules:
186186
- if: $FUNCTIONAL_TEST == "yes"
@@ -217,7 +217,7 @@ functional:x_notify:
217217
- export RO_API_TOKEN=${PROJECT_ACCESS_TOKEN_MCORE}
218218
- export GITLAB_ENDPOINT
219219
- export CONTEXT=$FUNCTIONAL_TEST_SCOPE
220-
- export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" ]] && echo "1" || "0")
220+
# TAG_TEAM is "1" on the protected main/dev branches and "0" everywhere else.
# The fallback must be `echo "0"`: a bare "0" would be executed as a command and leave TAG_TEAM empty.
- export TAG_TEAM=$([[ "$CI_COMMIT_BRANCH" == "main" || "$CI_COMMIT_BRANCH" == "dev" ]] && echo "1" || echo "0")
221221
- export TEAM_SLUG=$SLACK_ADMIN
222222
- |
223223
python tests/test_utils/python_scripts/notify.py \

CODEOWNERS

Lines changed: 0 additions & 26 deletions
This file was deleted.

docker/.ngc_version.dev

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
nvcr.io/nvidia/pytorch:25.09-py3

docker/.ngc_version.lts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
nvcr.io/nvidia/pytorch:25.09-py3

docker/Dockerfile.ci.dev

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# syntax=docker/dockerfile:1.3-labs
22

33
ARG FROM_IMAGE_NAME
4-
ARG WHEEL_DIR=/workspace/wheels
5-
64
FROM ${FROM_IMAGE_NAME} as main
75
ENV PIP_CONSTRAINT=""
86
ENV DEBIAN_FRONTEND=noninteractive
@@ -25,16 +23,16 @@ RUN bash -ex <<"EOF"
2523
curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
2624
EOF
2725

28-
ARG WHEEL_DIR
2926
COPY README.md pyproject.toml uv.lock /workspace/
3027
COPY megatron/core/__init__.py /workspace/megatron/core/
3128
COPY megatron/core/package_info.py /workspace/megatron/core/
29+
ARG IMAGE_TYPE=dev
3230
RUN --mount=type=cache,target=/root/.cache/uv \
3331
bash -ex <<"EOF"
3432
export NVTE_CUDA_ARCHS="80;90;100"
3533
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
3634
uv sync --only-group build
37-
uv sync --extra dev --extra mlm --link-mode copy --locked \
35+
uv sync --extra ${IMAGE_TYPE} --extra mlm --link-mode copy --locked \
3836
--no-install-package torch \
3937
--no-install-package torchvision \
4038
--no-install-package triton \

0 commit comments

Comments
 (0)