1- .test_mr_rules :
1+ .test_rules :
22 rules :
33 - if : $CI_PIPELINE_SOURCE == 'merge_request_event' && $CI_MERGE_REQUEST_TARGET_BRANCH_PROTECTED != "true"
44 allow_failure : true
99include :
1010 - template : Security/Secret-Detection.gitlab-ci.yml
1111
12- build_image :
13- extends : [.test_mr_rules ]
12+ test: build_image :
13+ extends : [.test_rules, .dind_rules ]
1414 tags :
1515 - ${TAG}
16- image : docker:26.1.4-dind
1716 timeout : 45m
1817 parallel :
1918 matrix :
20- - IMAGE : CI_MCORE_IMAGE
21- FILE : Dockerfile.ci
19+ - IMAGE : CI_MCORE_LTS_IMAGE
20+ FILE : Dockerfile.ci.lts
2221 BASE_IMAGE : nvcr.io/nvidia/pytorch:24.01-py3
2322 TAG : mcore-docker-node-large
2423 - IMAGE : CI_MCORE_DEV_IMAGE
2524 FILE : Dockerfile.ci.dev
2625 BASE_IMAGE : nvcr.io/nvidia/pytorch:24.07-py3
2726 TAG : mcore-docker-node-large
2827 - IMAGE : CI_NEMO_IMAGE
29- FILE : Dockerfile.ci
28+ FILE : Dockerfile.ci.lts
3029 BASE_IMAGE : nvcr.io/nvidian/nemo:nightly
3130 TAG : mcore-docker-node-large
3231 - IMAGE : LINTING_IMAGE
3332 FILE : Dockerfile.linting
3433 BASE_IMAGE : python:3.10
3534 TAG : mcore-docker-node-small
36- before_script :
37- - echo "$NGC_API_KEY" | docker login nvcr.io -u '$oauthtoken' --password-stdin
38- - echo "$CI_REGISTRY_PASSWORD" | docker login $CI_REGISTRY -u $CI_REGISTRY_USER --password-stdin
3935 variables :
4036 STAGE : main
4137 script :
@@ -45,8 +41,6 @@ build_image:
4541 set -x
4642 env
4743 eval "IMAGE=\$$IMAGE"
48-
49- docker system prune -a --filter "until=24h" -f || true
5044
5145 docker buildx create --name container --driver=docker-container
5246
@@ -61,13 +55,22 @@ build_image:
6155 ADDITIONAL_PARAMS+=("-t ${IMAGE}:nightly")
6256 fi
6357
58+ if [[ "$CI_PIPELINE_SOURCE" == "merge_request_event" ]]; then
59+ MCORE_REF=$(echo ${CI_MERGE_REQUEST_REF_PATH} | sed 's/head$/merge/')
60+ else
61+ MCORE_REF=$CI_COMMIT_SHA
62+ fi
63+
6464 DOCKER_BUILDKIT=1 docker build \
6565 --secret id=JET_INDEX_URLS \
6666 --target $STAGE \
6767 -f $FILE \
6868 -t ${IMAGE}:${CI_PIPELINE_ID} \
6969 --builder=container \
7070 --build-arg CACHEBUST=$(cat /proc/sys/kernel/random/uuid) \
71+ --build-arg MCORE_REPO=${CI_REPOSITORY_URL} \
72+ --build-arg MCORE_REF=${MCORE_REF} \
73+ --build-arg MCORE_BACKWARDS_REF="core_r0.9.0" \
7174 --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_PIPELINE_ID} \
7275 --cache-to type=registry,ref=${IMAGE}-buildcache:${CI_MERGE_REQUEST_IID:-noop} \
7376 --cache-from type=registry,ref=${IMAGE}-buildcache:main \
@@ -80,55 +83,37 @@ build_image:
8083 retry :
8184 max : 2
8285
# Shared template for the unit-test jobs. Concrete jobs extend it and supply
#   IMAGE - repository of the image to test; the ${CI_PIPELINE_ID} tag is used
#   TAG   - which source tree inside the image to test: "latest" selects
#           /opt/megatron-lm, any other value selects /opt/megatron-lm-<TAG>/
# UNIT_TEST_REPEAT and UNIT_TEST_TIMEOUT (minutes per repetition) are read from
# the job environment and forwarded into the container.
#
# The repository is not checked out by the job (GIT_STRATEGY: none). The job
# starts the image as a detached container, runs pytest inside it through
# `docker exec`, and stops the container in after_script.
.unit_tests:
  extends: [.test_rules, .dind_rules]
  needs: [test:build_image]
  timeout: 180m
  tags: [8xL40S]
  variables:
    GIT_STRATEGY: none
  script:
    # UNIT_TEST_REPEAT=0 turns the job into a successful no-op.
    - if [ "${UNIT_TEST_REPEAT}" -eq 0 ]; then exit 0; fi;
    # Keep-alive container that the tests are exec'ed into.
    # - Named after CI_JOB_ID, not CI_PIPELINE_ID: every unit-test job of a
    #   pipeline shares the pipeline id, so a pipeline-scoped name can collide
    #   (or let one job's after_script stop another job's container) when jobs
    #   share a Docker daemon.
    # - Sleeps for REPEAT x TIMEOUT (+60s slack): the loop below applies the
    #   timeout to each repetition, and the container is removed (--rm) as soon
    #   as the sleep returns.
    - docker run --name mcore_ci_${CI_JOB_ID} -d --rm -e TAG -e UNIT_TEST_REPEAT -e UNIT_TEST_TIMEOUT --gpus all --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 ${IMAGE}:${CI_PIPELINE_ID} bash -c "sleep $(( ${UNIT_TEST_REPEAT} * ${UNIT_TEST_TIMEOUT} * 60 + 60 ))"
    # The inner script is single-quoted, so $TAG, $UNIT_TEST_REPEAT and
    # $UNIT_TEST_TIMEOUT are expanded inside the container (passed via -e),
    # not by the runner shell. Non-"latest" tags additionally exclude tests
    # marked "internal".
    - |
      docker exec mcore_ci_${CI_JOB_ID} bash -c '
        set -e

        MCORE_DIR=$([[ "$TAG" == "latest" ]] && echo "" || echo "-$TAG/")

        cd /opt/megatron-lm$MCORE_DIR;

        for i in $(seq $UNIT_TEST_REPEAT); do
          SEED=$((RANDOM % 9000 + 1000));
          ARGS=()
          if [[ $TAG != latest ]]; then
            ARGS+=(-m "not internal and not flaky and not flaky_in_dev")
          else
            ARGS+=(-m "not flaky and not flaky_in_dev")
          fi
          timeout ${UNIT_TEST_TIMEOUT}m torchrun --nproc_per_node=8 -m pytest --random-order --random-order-seed ${SEED} -xvs --cov-report=term --cov-report=html --cov=megatron/core --no-cov-on-fail "${ARGS[@]}" tests/unit_tests
        done
      '
  after_script:
    # Best effort: the container may already be gone (early exit above, or the
    # keep-alive sleep finished).
    - docker container stop mcore_ci_${CI_JOB_ID} || true
132117 artifacts :
133118 paths :
134119 - coverage
@@ -138,10 +123,38 @@ unit_tests:
138123 when : always
139124 - when : always
140125
141- unit-tests-results-notify :
142- extends : [.test_mr_rules]
143- image : ${CI_MCORE_IMAGE}:${CI_PIPELINE_ID}
144- needs : [unit_tests]
# Unit tests from the .unit_tests template, run in the CI_MCORE_LTS_IMAGE
# image with TAG=latest.
test:pyt(LTS)_mcore(latest):
  extends:
    - .unit_tests
  variables:
    IMAGE: ${CI_MCORE_LTS_IMAGE}
    TAG: latest
131+
# Unit tests from the .unit_tests template, run in the CI_MCORE_LTS_IMAGE
# image with TAG=core_r0.9.0.
test:pyt(LTS)_mcore(0.9.0):
  extends:
    - .unit_tests
  variables:
    IMAGE: ${CI_MCORE_LTS_IMAGE}
    TAG: core_r0.9.0
137+
# Unit tests from the .unit_tests template, run in the CI_MCORE_DEV_IMAGE
# image with TAG=latest.
test:pyt(DEV)_mcore(latest):
  extends:
    - .unit_tests
  variables:
    IMAGE: ${CI_MCORE_DEV_IMAGE}
    TAG: latest
143+
# Unit tests from the .unit_tests template, run in the CI_MCORE_DEV_IMAGE
# image with TAG=core_r0.9.0.
test:pyt(DEV)_mcore(0.9.0):
  extends:
    - .unit_tests
  variables:
    IMAGE: ${CI_MCORE_DEV_IMAGE}
    TAG: core_r0.9.0
149+
150+ test:notify :
151+ extends : [.test_rules]
152+ image : ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
153+ needs :
154+ - test:pyt(LTS)_mcore(latest)
155+ - test:pyt(DEV)_mcore(latest)
156+ - test:pyt(LTS)_mcore(0.9.0)
157+ - test:pyt(DEV)_mcore(0.9.0)
145158 tags :
146159 - mcore-docker-node-small
147160 script :
@@ -160,39 +173,40 @@ unit-tests-results-notify:
160173 when : always
161174 - when : never
162175
# Runs the documentation repository's build against this checkout.
# Runs in the LTS image built for this pipeline, after test:build_image.
test:docs_build:
  extends: [.test_rules]
  image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
  tags: [mcore-docker-node-small]
  # Job names are written without a space after the colon; inside a flow
  # sequence "test: build_image" would be read as a key/value pair.
  needs: [test:build_image]
  script:
    # Step out of the checkout so it can be moved as a whole.
    - cd ..
    # Always start from a fresh clone of the documentation repository.
    - rm -rf documentation && git clone https://gitlab-ci-token:${CI_JOB_TOKEN}@${GITLAB_ENDPOINT}/nemo-megatron-core-tme/documentation.git
    # Place this checkout inside the documentation tree.
    # NOTE(review): assumes the project directory is named megatron-lm.
    - mv megatron-lm/ documentation/
    - cd documentation/
    # NOTE(review): presumably the documentation repo's build entry point;
    # its behaviour is defined outside this file.
    - ./repo docs
174187
# Runs tools/autoformat.sh in check-only mode inside the linting image built
# for this pipeline.
test:formatting:
  extends: [.test_rules]
  image: ${LINTING_IMAGE}:${CI_PIPELINE_ID}
  tags: [mcore-docker-node-small]
  # Job names are written without a space after the colon; inside a flow
  # sequence "test: build_image" would be read as a key/value pair.
  needs: [test:build_image]
  script:
    # Dump the job environment into the log for debugging.
    - env
    # NOTE(review): only main is fetched, while BASE_REF below is the merge
    # request's target branch; confirm autoformat.sh copes with other targets.
    - git fetch origin main
    # SKIP_DOCS becomes "true" when the merge request carries a label
    # containing "Skip docs", otherwise "false".
    - BASE_REF="$CI_MERGE_REQUEST_TARGET_BRANCH_NAME" CHECK_ONLY=true SKIP_DOCS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Skip docs"* ]] && echo "true" || echo "false") bash tools/autoformat.sh
184197
# Runs tools/copyright.sh inside the LTS image built for this pipeline.
test:copyright:
  extends: [.test_rules]
  tags: [mcore-docker-node-small]
  image: ${CI_MCORE_LTS_IMAGE}:${CI_PIPELINE_ID}
  # Job names are written without a space after the colon; inside a flow
  # sequence "test: build_image" would be read as a key/value pair.
  needs: [test:build_image]
  script:
    # NOTE(review): presumably copyright.sh compares against origin/main;
    # the script is defined outside this file.
    - git fetch origin main
    - bash tools/copyright.sh
193206
194- secret_detection :
207+ test: secret_detection :
195208 tags : [mcore-docker-node-small]
209+ extends : " .secret-analyzer"
196210 variables :
197211 GIT_DEPTH : 0
198212 SECRET_DETECTION_LOG_OPTIONS : ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
0 commit comments