Skip to content

Commit 55622ff

Browse files
committed
Merge branch 'ko3n1g/ci/refactor-jobs' into 'main'
ci(refactor): Facelift gitlab-ci. See merge request ADLR/megatron-lm!2223
2 parents 0d89fc4 + 33d2f45 commit 55622ff

File tree

18 files changed

+231
-2047
lines changed

18 files changed

+231
-2047
lines changed

.gitlab-ci.yml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,23 +14,23 @@ workflow:
1414
- if: $CI_MERGE_REQUEST_LABELS =~ /Run tests/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
1515
variables:
1616
UNIT_TEST_REPEAT: 5
17-
UNIT_TEST_TIMEOUT: 50
17+
UNIT_TEST_TIMEOUT: 75
1818
FUNCTIONAL_TEST: "yes"
1919
FUNCTIONAL_TEST_SCOPE: mr
2020
FUNCTIONAL_TEST_CLUSTER_A100: ""
2121
FUNCTIONAL_TEST_CLUSTER_H100: ""
2222
- if: $CI_MERGE_REQUEST_LABELS =~ /Run nightly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
2323
variables:
2424
UNIT_TEST_REPEAT: 5
25-
UNIT_TEST_TIMEOUT: 50
25+
UNIT_TEST_TIMEOUT: 75
2626
FUNCTIONAL_TEST: "yes"
2727
FUNCTIONAL_TEST_SCOPE: nightly
2828
FUNCTIONAL_TEST_CLUSTER_A100: ""
2929
FUNCTIONAL_TEST_CLUSTER_H100: ""
3030
- if: $CI_MERGE_REQUEST_LABELS =~ /Run weekly/ && $CI_MERGE_REQUEST_TARGET_BRANCH_SHA != ""
3131
variables:
3232
UNIT_TEST_REPEAT: 5
33-
UNIT_TEST_TIMEOUT: 50
33+
UNIT_TEST_TIMEOUT: 75
3434
FUNCTIONAL_TEST: "yes"
3535
FUNCTIONAL_TEST_SCOPE: weekly
3636
FUNCTIONAL_TEST_CLUSTER_A100: ""
@@ -95,7 +95,7 @@ variables:
9595
description: Type of publish (freeze or final release)
9696

9797
# CI wide variables
98-
CI_MCORE_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci
98+
CI_MCORE_LTS_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_lts
9999
CI_MCORE_DEV_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci_dev
100100
CI_NEMO_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/nemo_ci
101101
LINTING_IMAGE: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_linting
@@ -104,6 +104,6 @@ variables:
104104

105105
include:
106106
- .gitlab/stages/00.pre.yml
107-
- .gitlab/stages/01.tests.yml
107+
- .gitlab/stages/01.test.yml
108108
- .gitlab/stages/02.functional-tests.yml
109109
- .gitlab/stages/03.publish.yml

.gitlab/labeler-config.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
11
CI:
22
- .gitlab-ci.yml
3-
- Dockerfile.ci
4-
- jet-tests.yml
3+
- Dockerfile.ci.lts
4+
- Dockerfile.ci.dev
5+
- .github/**
6+
- .gitlab/**
57

68
Datasets:
79
- megatron/core/datasets/**

.gitlab/stages/00.pre.yml

Lines changed: 22 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
include:
22
- template: Security/Secret-Detection.gitlab-ci.yml
33

4-
.pre_mr_rules:
4+
.pre_rules:
55
rules:
66
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
77
allow_failure: true
@@ -10,7 +10,16 @@ include:
1010
- when: never
1111
stage: .pre
1212

13-
mirror_to_github:
13+
.dind_rules:
14+
image: docker:26.1.4-dind
15+
variables:
16+
DOCKER_HOST: unix:///var/run/docker.sock
17+
before_script:
18+
- docker system prune -a --filter "until=36h" -f || true
19+
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
20+
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
21+
22+
pre:mirror_to_github:
1423
rules:
1524
- if: '$CI_COMMIT_REF_PROTECTED == "true" && $CI_PIPELINE_SOURCE == "push"'
1625
- when: never
@@ -24,7 +33,7 @@ mirror_to_github:
2433
- git remote add github https://ko3n1g:$GH_TOKEN@github.com/NVIDIA/Megatron-LM.git || true
2534
- git push -u github $CI_COMMIT_BRANCH
2635

27-
create_ci_branches:
36+
pre:create_ci_branches:
2837
rules:
2938
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
3039
- when: never
@@ -47,8 +56,8 @@ create_ci_branches:
4756
- git switch --force-create $branch
4857
- git push --force -u origin $branch
4958

50-
label_merge_request:
51-
extends: [.pre_mr_rules]
59+
pre:label_merge_request:
60+
extends: [.pre_rules]
5261
image: golang:1.22
5362
tags:
5463
- mcore-docker-node-small
@@ -67,21 +76,17 @@ label_merge_request:
6776
source labels
6877
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
6978
70-
clean_docker_node:
71-
extends: [.pre_mr_rules]
72-
image: docker:26.1.4-dind
79+
pre:clean_docker_node:
80+
extends: [.pre_rules, .dind_rules]
7381
tags:
7482
- ${node}
7583
parallel:
7684
matrix:
77-
- node: 8xL40S
7885
- node: mcore-docker-node-small
79-
- node: mcore-docker-node-jet
80-
script:
81-
- export DOCKER_HOST='unix:///var/run/docker.sock'
82-
- docker system prune -a --filter "until=36h" -f || true
86+
- node: mcore-docker-node-large
87+
script: ':'
8388

84-
maybe_cherry_pick_commit:
89+
pre:maybe_cherry_pick_commit:
8590
rules:
8691
- if: '$CI_COMMIT_BRANCH == $CI_DEFAULT_BRANCH && $CI_PIPELINE_SOURCE == "push"'
8792
- when: never
@@ -168,11 +173,10 @@ maybe_cherry_pick_commit:
168173
done
169174
interruptible: false
170175

171-
check_milestone:
172-
extends: [.pre_mr_rules]
176+
pre:check_milestone:
177+
extends: [.pre_rules]
173178
image: ${GITLAB_ENDPOINT}:5005/adlr/megatron-lm/mcore_ci:buildcache
174-
tags:
175-
- mcore-docker-node-small
179+
tags: [mcore-docker-node-small]
176180
script:
177181
- env
178182
- |

.gitlab/stages/01.tests.yml renamed to .gitlab/stages/01.test.yml

Lines changed: 86 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.test_mr_rules:
1+
.test_rules:
22
rules:
33
- if: $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
44
allow_failure: true
@@ -9,33 +9,29 @@
99
include:
1010
- template: Security/Secret-Detection.gitlab-ci.yml
1111

12-
build_image:
13-
extends: [.test_mr_rules]
12+
test:build_image:
13+
extends: [.test_rules, .dind_rules]
1414
tags:
1515
- ${TAG}
16-
image: docker:26.1.4-dind
1716
timeout: 45m
1817
parallel:
1918
matrix:
20-
- IMAGE: CI_MCORE_IMAGE
21-
FILE: Dockerfile.ci
19+
- IMAGE: CI_MCORE_LTS_IMAGE
20+
FILE: Dockerfile.ci.lts
2221
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.01-py3
2322
TAG: mcore-docker-node-large
2423
- IMAGE: CI_MCORE_DEV_IMAGE
2524
FILE: Dockerfile.ci.dev
2625
BASE_IMAGE: nvcr.io/nvidia/pytorch:24.07-py3
2726
TAG: mcore-docker-node-large
2827
- IMAGE: CI_NEMO_IMAGE
29-
FILE: Dockerfile.ci
28+
FILE: Dockerfile.ci.lts
3029
BASE_IMAGE: nvcr.io/nvidian/nemo:nightly
3130
TAG: mcore-docker-node-large
3231
- IMAGE: LINTING_IMAGE
3332
FILE: Dockerfile.linting
3433
BASE_IMAGE: python:3.10
3534
TAG: mcore-docker-node-small
36-
before_script:
37-
- echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
38-
- echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
3935
variables:
4036
STAGE: main
4137
script:
@@ -45,8 +41,6 @@ build_image:
4541
set -x
4642
env
4743
eval "IMAGE=\$$IMAGE"
48-
49-
docker system prune -a --filter "until=24h" -f || true
5044
5145
docker buildx create --name container --driver=docker-container
5246
@@ -61,13 +55,22 @@ build_image:
6155
ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
6256
fi
6357
58+
if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" ]]; then
59+
MCORE_REF=$(echo ${CI_MERGE_REQUEST_REF_PATH} | sed 's/head$/merge/')
60+
else
61+
MCORE_REF=$CI_COMMIT_SHA
62+
fi
63+
6464
DOCKER_BUILDKIT=1 docker build \
6565
--secret id=JET_INDEX_URLS \
6666
--target $STAGE \
6767
-f $FILE \
6868
-t ${IMAGE}:${CI_PIPELINE_ID} \
6969
--builder=container \
7070
--build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
71+
--build-arg MCORE_REPO=${CI_REPOSITORY_URL} \
72+
--build-arg MCORE_REF=${MCORE_REF} \
73+
--build-arg MCORE_BACKWARDS_REF="core_r0.9.0" \
7174
--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
7275
--cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
7376
--cache-from type=registry,ref=${IMAGE}-buildcache:main \
@@ -80,55 +83,37 @@ build_image:
8083
retry:
8184
max: 2
8285

83-
unit_tests:
84-
# This job runs both test suite of ToT and of a historic ref against
85-
# the current code. This is a form of backwards compatibility testing
86-
# and helps in providing stable interfaces.
87-
extends: [.test_mr_rules]
88-
image: ${IMAGE}:${CI_PIPELINE_ID}
89-
needs: [build_image]
86+
.unit_tests:
87+
extends: [.test_rules, .dind_rules]
88+
needs: [test:build_image]
9089
timeout: 180m
91-
parallel:
92-
matrix:
93-
- TAG: latest
94-
IMAGE: ${CI_MCORE_IMAGE}
95-
- TAG: latest
96-
IMAGE: ${CI_MCORE_DEV_IMAGE}
97-
- TAG: core_r0.9.0
98-
IMAGE: ${CI_MCORE_IMAGE}
99-
- TAG: core_r0.9.0
100-
IMAGE: ${CI_MCORE_DEV_IMAGE}
10190
tags: [8xL40S]
10291
variables:
103-
GIT_STRATEGY: clone
104-
GIT_DEPTH: 0
105-
before_script:
106-
- |
107-
if [[ $TAG != latest ]]; then
108-
git checkout $TAG
109-
rm -rf /opt/megatron-lm/tests
110-
cp -r tests/ /opt/megatron-lm
111-
fi
92+
GIT_STRATEGY: none
11293
script:
94+
- if [ $UNIT_TEST_REPEAT -eq 0 ]; then exit 0; fi;
95+
- docker run --name mcore_ci_${CI_PIPELINE_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
11396
- |
114-
export NVTE_FLASH_ATTN=0
115-
export NVTE_FUSED_ATTN=0
116-
117-
cd /opt/megatron-lm
118-
if [[ $UNIT_TEST_REPEAT -eq 0 ]]; then
119-
exit 0
120-
fi
121-
122-
for i in $(seq $UNIT_TEST_REPEAT); do
123-
SEED=$((RANDOM % 9000 + 1000));
124-
ARGS=()
125-
if [[ $TAG != latest ]]; then
126-
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
127-
else
128-
ARGS+=(-m "not flaky and not flaky_in_dev")
129-
fi
130-
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
131-
done
97+
docker exec mcore_ci_${CI_PIPELINE_ID} bash -c '
98+
set -e
99+
100+
MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/")
101+
102+
cd /opt/megatron-lm$MCORE_DIR;
103+
104+
for i in $(seq $UNIT_TEST_REPEAT); do
105+
SEED=$((RANDOM % 9000 + 1000));
106+
ARGS=()
107+
if [[ $TAG != latest ]]; then
108+
ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
109+
else
110+
ARGS+=(-m "not flaky and not flaky_in_dev")
111+
fi
112+
timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
113+
done
114+
'
115+
after_script:
116+
- docker container stop mcore_ci_${CI_PIPELINE_ID} || true
132117
artifacts:
133118
paths:
134119
- coverage
@@ -138,10 +123,38 @@ unit_tests:
138123
when: always
139124
- when: always
140125

141-
unit-tests-results-notify:
142-
extends: [.test_mr_rules]
143-
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
144-
needs: [unit_tests]
126+
test:pyt(LTS)_mcore(latest):
127+
extends: [.unit_tests]
128+
variables:
129+
TAG: latest
130+
IMAGE: ${CI_MCORE_LTS_IMAGE}
131+
132+
test:pyt(LTS)_mcore(0.9.0):
133+
extends: [.unit_tests]
134+
variables:
135+
TAG: core_r0.9.0
136+
IMAGE: ${CI_MCORE_LTS_IMAGE}
137+
138+
test:pyt(DEV)_mcore(latest):
139+
extends: [.unit_tests]
140+
variables:
141+
TAG: latest
142+
IMAGE: ${CI_MCORE_DEV_IMAGE}
143+
144+
test:pyt(DEV)_mcore(0.9.0):
145+
extends: [.unit_tests]
146+
variables:
147+
TAG: core_r0.9.0
148+
IMAGE: ${CI_MCORE_DEV_IMAGE}
149+
150+
test:notify:
151+
extends: [.test_rules]
152+
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
153+
needs:
154+
- test:pyt(LTS)_mcore(latest)
155+
- test:pyt(DEV)_mcore(latest)
156+
- test:pyt(LTS)_mcore(0.9.0)
157+
- test:pyt(DEV)_mcore(0.9.0)
145158
tags:
146159
- mcore-docker-node-small
147160
script:
@@ -160,39 +173,40 @@ unit-tests-results-notify:
160173
when: always
161174
- when: never
162175

163-
docs_build_test:
164-
extends: [.test_mr_rules]
165-
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
176+
test:docs_build:
177+
extends: [.test_rules]
178+
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
166179
tags: [mcore-docker-node-small]
167-
needs: [build_image]
180+
needs: [test:build_image]
168181
script:
169182
- cd ..
170183
- rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
171184
- mv megatron-lm/ documentation/
172185
- cd documentation/
173186
- ./repo docs
174187

175-
formatting:
176-
extends: [.test_mr_rules]
188+
test:formatting:
189+
extends: [.test_rules]
177190
image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
178191
tags: [mcore-docker-node-small]
179-
needs: [build_image]
192+
needs: [test:build_image]
180193
script:
181194
- env
182195
- git fetch origin main
183196
- BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
184197

185-
copyright:
186-
extends: [.test_mr_rules]
198+
test:copyright:
199+
extends: [.test_rules]
187200
tags: [mcore-docker-node-small]
188-
image: ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
189-
needs: [build_image]
201+
image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
202+
needs: [test:build_image]
190203
script:
191204
- git fetch origin main
192205
- bash tools/copyright.sh
193206

194-
secret_detection:
207+
test:secret_detection:
195208
tags: [mcore-docker-node-small]
209+
extends: ".secret-analyzer"
196210
variables:
197211
GIT_DEPTH: 0
198212
SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}

0 commit comments

Comments
 (0)