Skip to content

Commit 23a1dca

Browse files
authored
replay: ci: Bump LTS container (#2157)
Signed-off-by: oliver könig <[email protected]>
1 parent f167a85 commit 23a1dca

File tree

88 files changed

+34082
-1945
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

88 files changed

+34082
-1945
lines changed

.github/workflows/cicd-main.yml

Lines changed: 33 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,11 +232,26 @@ jobs:
232232
with:
233233
python-version: 3.12
234234

235+
- name: Install GH CLI
236+
shell: bash -x -e -u -o pipefail {0}
237+
run: |
238+
apt-get update
239+
apt-get install -y gh
240+
235241
- name: Get PR info
236242
id: get-pr-info
237243
if: startsWith(github.ref, 'refs/heads/pull-request/')
238244
uses: nv-gha-runners/get-pr-info@main
239245

246+
- name: Has lts label
247+
id: has-lts-label
248+
env:
249+
GH_TOKEN: ${{ secrets.PAT }}
250+
run: |
251+
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
252+
HAS_LTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "container::lts")') || echo "false"
253+
echo "main=$HAS_LTS_LABEL" | tee -a $GITHUB_OUTPUT
254+
240255
- name: Download test data
241256
shell: bash
242257
env:
@@ -275,6 +290,22 @@ jobs:
275290
echo "$LAST_PRS" | tee -a $GITHUB_OUTPUT
276291
echo "EOF" | tee -a $GITHUB_OUTPUT
277292
293+
- name: Parse baseimage
294+
shell: bash
295+
id: base-image
296+
env:
297+
HAS_LTS_LABEL: ${{ steps.has-lts-label.outputs.main }}
298+
run: |
299+
if [ "$HAS_LTS_LABEL" == "true" ]; then
300+
NGC_VERSION=$(cat docker/.ngc_version.lts)
301+
echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
302+
echo "image_type=lts" | tee -a $GITHUB_OUTPUT
303+
else
304+
NGC_VERSION=$(cat docker/.ngc_version.dev)
305+
echo "version=$NGC_VERSION" | tee -a $GITHUB_OUTPUT
306+
echo "image_type=dev" | tee -a $GITHUB_OUTPUT
307+
fi
308+
278309
- name: Set up Docker Buildx
279310
uses: docker/setup-buildx-action@v3
280311

@@ -286,7 +317,8 @@ jobs:
286317
context: .
287318
target: main
288319
build-args: |
289-
FROM_IMAGE_NAME=nvcr.io/nvidia/pytorch:25.09-py3
320+
FROM_IMAGE_NAME=${{ steps.base-image.outputs.version }}
321+
IMAGE_TYPE=${{ steps.base-image.outputs.image_type }}
290322
cache-from: |
291323
type=registry,ref=${{ env.container-registry }}/megatron-lm:${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number || 0 }}-buildcache,mode=max
292324
type=registry,ref=${{ env.container-registry }}/megatron-lm:main-buildcache,mode=max

.gitlab/stages/01.build.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,10 +53,12 @@ test:build_image:
5353
parallel:
5454
matrix:
5555
- IMAGE: CI_MCORE_LTS_IMAGE
56-
FILE: Dockerfile.ci.lts
57-
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
56+
FILE: Dockerfile.ci.dev
57+
IMAGE_TYPE: lts
58+
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
5859
- IMAGE: CI_MCORE_DEV_IMAGE
5960
FILE: Dockerfile.ci.dev
61+
IMAGE_TYPE: dev
6062
BASE_IMAGE: nvcr.io/nvidia/pytorch:25.09-py3
6163
- IMAGE: UTILITY_IMAGE
6264
FILE: Dockerfile.linting

docker/.ngc_version.dev

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
nvcr.io/nvidia/pytorch:25.09-py3

docker/.ngc_version.lts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
nvcr.io/nvidia/pytorch:25.09-py3

docker/Dockerfile.ci.dev

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# syntax=docker/dockerfile:1.3-labs
22

33
ARG FROM_IMAGE_NAME
4-
ARG WHEEL_DIR=/workspace/wheels
5-
64
FROM ${FROM_IMAGE_NAME} as main
75
ENV PIP_CONSTRAINT=""
86
ENV DEBIAN_FRONTEND=noninteractive
@@ -25,16 +23,16 @@ RUN bash -ex <<"EOF"
2523
curl -LsSf https://astral.sh/uv/${UV_VERSION}/install.sh | sh
2624
EOF
2725

28-
ARG WHEEL_DIR
2926
COPY README.md pyproject.toml uv.lock /workspace/
3027
COPY megatron/core/__init__.py /workspace/megatron/core/
3128
COPY megatron/core/package_info.py /workspace/megatron/core/
29+
ARG IMAGE_TYPE=dev
3230
RUN --mount=type=cache,target=/root/.cache/uv \
3331
bash -ex <<"EOF"
3432
export NVTE_CUDA_ARCHS="80;90;100"
3533
uv venv ${UV_PROJECT_ENVIRONMENT} --system-site-packages
3634
uv sync --only-group build
37-
uv sync --extra dev --extra mlm --link-mode copy --locked \
35+
uv sync --extra ${IMAGE_TYPE} --extra mlm --link-mode copy --locked \
3836
--no-install-package torch \
3937
--no-install-package torchvision \
4038
--no-install-package triton \

docker/Dockerfile.ci.lts

Lines changed: 0 additions & 98 deletions
This file was deleted.

pyproject.toml

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -69,7 +69,7 @@ mlm = ["flask-restful", "sentencepiece", "tiktoken", "wandb", "transformers"]
6969
dev = [
7070
"nvidia-modelopt[torch]; sys_platform != 'darwin'",
7171
"transformer-engine[pytorch]>=2.9.0a0,<2.10.0",
72-
"nvidia-resiliency-ext>=0.4.0a0,<0.5.0",
72+
"nvidia-resiliency-ext",
7373
"tqdm",
7474
"einops~=0.8",
7575
"tensorstore~=0.1,!=0.1.46,!=0.1.72",
@@ -89,13 +89,20 @@ dev = [
8989

9090
lts = [
9191
"tqdm",
92-
"einops",
93-
"tensorstore!=0.1.46,!=0.1.72",
94-
"nvtx",
95-
"transformers",
96-
"zarr",
92+
"einops~=0.8",
93+
"tensorstore~=0.1,!=0.1.46,!=0.1.72",
94+
"nvtx~=0.2",
95+
"multi-storage-client~=0.27",
96+
"opentelemetry-api~=1.33.1",
9797
"setuptools<80.0.0",
98+
"mamba-ssm~=2.2",
99+
"causal-conv1d~=1.5",
100+
"nv-grouped-gemm~=1.1",
101+
"megatron-energon[av_decode]~=6.0",
102+
"av<16.0.0", # At the time, av 16.0.0 is not compatible with Python 3.12
103+
"flashinfer-python",
98104
"wget",
105+
"onnxscript",
99106
]
100107

101108
[dependency-groups]

0 commit comments

Comments
 (0)