Skip to content

Commit 52f7fd7

Browse files
committed
Merge remote-tracking branch 'upstream/main' into fp8_sequence_parallel
Signed-off-by: Keshav Santhanam <[email protected]>
2 parents d808109 + f110cd0 commit 52f7fd7

File tree

1,046 files changed

+19661
-5513
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

1,046 files changed

+19661
-5513
lines changed

.github/CODEOWNERS

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
megatron/core @NVIDIA/core-adlr @NVIDIA/core-nemo
2+
3+
megatron/core/models/gpt/ @NVIDIA/gpt
4+
5+
megatron/core/models/multimodal/ @NVIDIA/multi-modal
6+
7+
megatron/core/models/mamba/ @NVIDIA/hybrid-mamba
8+
9+
megatron/core/dist_checkpointing/ @NVIDIA/dist-checkpointing
10+
11+
megatron/core/optimizer/distrib_optimizer/ @NVIDIA/dist-optimizer
12+
13+
megatron/core/inference/modelopt_support @NVIDIA/quantization-and-inference
14+
15+
# megatron/core/datasets/ @NVIDIA/datasets
16+
17+
megatron/core/pipeline_parallel/ @NVIDIA/pipeline-parallelism
18+
19+
megatron/core/transformer/ @NVIDIA/core-adlr @NVIDIA/core-nemo
20+
21+
megatron/core/transformer/moe/ @NVIDIA/core-adlr @NVIDIA/core-devtech
22+
23+
# megatron/core/inference/ @NVIDIA/inference
24+
25+
megatron/core/parallel_state.py @NVIDIA/core-nemo
26+
27+
megatron/core/post_training/ @NVIDIA/post-training
28+
megatron/post_training @NVIDIA/post-training
29+
30+
.gitlab/ @NVIDIA/ci
31+
.github/ @NVIDIA/ci
32+
.gitlab-ci.yml @NVIDIA/ci
33+
docker/ @NVIDIA/ci
34+
tests/unit_tests/run_ci_test.sh @NVIDIA/ci
35+
tests/test_utils/python_scripts/ @NVIDIA/ci
36+
tests/functional_tests/python_test_utils/ @NVIDIA/ci
37+
tests/functional_tests/shell_test_utils/ @NVIDIA/ci
38+
megatron/core/transformer/transformer_block.py @NVIDIA/ci
39+
megatron/core/transformer/transformer_layer.py @NVIDIA/ci
40+
tests/functional_tests/test_cases/ @NVIDIA/ci
41+
tests/functional_tests/recipes/ @NVIDIA/ci
42+
tests/unit_tests/ @NVIDIA/ci
43+
44+
megatron/rl/ @NVIDIA/reinforcement-learning
45+
examples/rl/ @NVIDIA/reinforcement-learning
46+
tests/unit_tests/test_rl_utils.py @NVIDIA/reinforcement-learning
47+
train_rl.py @NVIDIA/reinforcement-learning

.github/actions/action.yml

Lines changed: 85 additions & 155 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,9 @@ name: "Test Template"
1515
description: "Template for running NeMo tests in a containerized environment"
1616

1717
inputs:
18+
container-image:
19+
description: "Container image to use for test"
20+
required: true
1821
timeout:
1922
description: "Max runtime of test in minutes"
2023
required: false
@@ -46,210 +49,140 @@ inputs:
4649
runs:
4750
using: "composite"
4851
steps:
49-
- name: Copy data
50-
shell: bash
51-
if: inputs.is_unit_test == 'false'
52-
env:
53-
SOURCE_DIR: /mnt/datadrive/TestData/megatron-lm/artifacts
54-
TARGET_DIR: /home/runner/_work/TestData/megatron-lm/artifacts
55-
MODEL: ${{ inputs.model }}
56-
run: |
57-
mkdir -p $TARGET_DIR/text/data/
58-
59-
if [[ "$MODEL" == "bert" ]]; then
60-
mkdir -p $TARGET_DIR/text/the_pile/bert_shard00/
61-
cp -a $SOURCE_DIR/text/the_pile/bert_shard00/. $TARGET_DIR/text/data/
62-
elif [[ "$MODEL" == "gpt" ]] || [[ "$MODEL" == "moe" ]]; then
63-
cp -a $SOURCE_DIR/text/the_pile/shard00/. $TARGET_DIR/text/data/
64-
fi
65-
66-
- name: Install curl, sudo
67-
shell: bash
68-
run: |
69-
sudo apt-get update
70-
sudo apt-get install -y curl uuid-runtime
71-
7252
- name: Checkout repository
7353
uses: actions/checkout@v4
74-
with:
75-
path: ${{ github.workspace }}/Megatron-LM
7654

77-
- name: Cache uv
78-
uses: actions/cache@v4
79-
id: cache
80-
with:
81-
path: cache-mount
82-
key: ${{ runner.os }}-uv-${{ hashFiles('**/uv.lock') }}
83-
restore-keys: |
84-
${{ runner.os }}-uv-
85-
86-
- name: Restore Docker cache mounts
87-
uses: reproducible-containers/buildkit-cache-dance@5b81f4d29dc8397a7d341dba3aeecc7ec54d6361
88-
with:
89-
cache-dir: cache-mount
90-
dockerfile: docker/Dockerfile.ci.dev
91-
skip-extraction: ${{ steps.cache.outputs.cache-hit }}
55+
- name: Change ownership of /home/runner/
56+
shell: bash
57+
run: sudo chown -R $(whoami) /home/runner/
9258

9359
- name: Setup python
9460
uses: actions/setup-python@v5
9561
with:
9662
python-version: 3.12
9763

98-
- name: Download test data
99-
shell: bash
100-
env:
101-
GH_TOKEN: ${{ inputs.PAT }}
102-
TIMEOUT: ${{ inputs.timeout }}
103-
IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
64+
- name: Install uuidgen
65+
shell: bash -x -e -u -o pipefail {0}
10466
run: |
105-
echo "::group::Download test data"
106-
pip install --no-cache-dir pygithub click
107-
python tests/test_utils/python_scripts/download_unit_tests_dataset.py --assets-dir ./assets
108-
echo "::endgroup::"
67+
apt-get update
68+
apt-get install -y uuid-runtime
10969
11070
- name: Create run-script (unit test)
111-
shell: bash
71+
shell: bash -x -e -u -o pipefail {0}
11272
if: inputs.is_unit_test == 'true'
11373
run: |
11474
echo "::group::Create run-script"
11575
cmd=$(cat <<'RUN_TEST_EOF'
11676
#!/bin/bash
11777
118-
docker exec -t test_container_${{ github.run_id }} bash -c '
119-
set -e
120-
bash /opt/megatron-lm/tests/unit_tests/run_ci_test.sh \
121-
--tag ${{ inputs.tag }} \
122-
--environment dev \
123-
--bucket '\''${{ inputs.test_case }}'\'' \
124-
--log-dir /opt/megatron-lm/outputs/logs
125-
'
78+
# Fail fast on any setup error, matching the e2e run-script.
set -euxo pipefail
export PYTHONPATH=$(pwd)
79+
export NEMORUN_HOME=$(pwd)
80+
pip install --no-cache-dir uv
81+
uv sync --only-group test
82+
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
83+
--scope unit-tests \
84+
--model unit-tests \
85+
--test-case "${{ inputs.test_case }}" \
86+
--environment dev \
87+
--platform dgx_h100 \
88+
--tag "${{ inputs.tag }}" \
89+
--container-image "${{ inputs.container-image }}"
12690
12791
RUN_TEST_EOF
12892
)
12993
echo "$cmd" | tee "job.sh"
13094
echo "::endgroup::"
13195
96+
- name: Get PR info
97+
id: get-pr-info
98+
if: startsWith(github.ref, 'refs/heads/pull-request/')
99+
uses: nv-gha-runners/get-pr-info@main
100+
101+
- name: Install GH CLI
102+
shell: bash -x -e -u -o pipefail {0}
103+
run: |
104+
apt-get update
105+
apt-get install -y gh
106+
107+
- name: Has Run tests label
108+
shell: bash -x -e -u -o pipefail {0}
109+
id: has-run-tests-label
110+
env:
111+
GH_TOKEN: ${{ github.token }}
112+
run: |
113+
PR_NUMBER=${{ fromJSON(steps.get-pr-info.outputs.pr-info || '{}').number }}
114+
# Assign the fallback to the variable; a bare `echo` would only print it and leave the output empty.
HAS_RUN_TESTS_LABEL=$(gh pr view $PR_NUMBER --json labels | jq '[.labels[].name] | any(. == "Run tests")') || HAS_RUN_TESTS_LABEL="false"
115+
echo "main=$HAS_RUN_TESTS_LABEL" | tee -a "$GITHUB_OUTPUT"
116+
132117
- name: Create run-script (e2e test)
133-
shell: bash
118+
shell: bash -x -e -u -o pipefail {0}
134119
if: inputs.is_unit_test == 'false'
135120
env:
136121
MODEL: ${{ inputs.model }}
137122
run: |
138123
echo "::group::Create run-script"
139124
cmd=$(cat <<'RUN_TEST_EOF'
140125
#!/bin/bash
141-
142-
143-
144-
docker exec -t test_container_${{ github.run_id }} bash -c '
145-
146-
set -e
147-
ls -al /workspace/data
148-
149-
if [[ "${{ inputs.model }}" == "bert" ]]; then
150-
TRAINING_SCRIPT_PATH=pretrain_bert.py
151-
elif [[ "${{ inputs.model }}" == "gpt" ]] || [[ "${{ inputs.model }}" == "moe" ]]; then
152-
TRAINING_SCRIPT_PATH=pretrain_gpt.py
153-
fi
154-
155-
ARGUMENTS=(
156-
"DATA_PATH=/workspace/data"
157-
"DATA_CACHE_PATH=/workspace/data/cache"
158-
"OUTPUT_PATH=$(pwd)/outputs/"
159-
"TENSORBOARD_PATH=$(pwd)/tensorboard"
160-
"CHECKPOINT_SAVE_PATH=$(pwd)/checkpoints"
161-
"CHECKPOINT_LOAD_PATH=/workspace/checkpoints/$NAME"
162-
"TRAINING_SCRIPT_PATH=$TRAINING_SCRIPT_PATH"
163-
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/model_config.yaml"
164-
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/${{inputs.model}}/${{inputs.test_case}}/golden_values_dev_dgx_h100.json"
165-
"N_REPEAT=5"
166-
"ENABLE_LIGHTWEIGHT_MODE=false"
167-
"RECORD_CHECKPOINTS=false"
168-
)
169-
170-
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${ARGUMENTS[@]}
171-
'
126+
set -euxo pipefail
127+
128+
export PYTHONPATH=$(pwd)
129+
export NEMORUN_HOME=$(pwd)
130+
pip install --no-cache-dir uv
131+
uv sync --only-group test
132+
uv run python tests/test_utils/python_scripts/launch_nemo_run_workload.py \
133+
--scope mr \
134+
--model "${{ inputs.model }}" \
135+
--test-case "${{ inputs.test_case }}" \
136+
--environment dev \
137+
--platform dgx_h100 \
138+
--container-image "${{ inputs.container-image }}" \
139+
--data-dir /mnt/datadrive/TestData/megatron-lm/artifacts \
140+
--enable-lightweight-mode
172141
173142
RUN_TEST_EOF
174143
)
175144
echo "$cmd" | tee "job.sh"
176145
echo "::endgroup::"
177146
178-
- name: Build container
179-
shell: bash
180-
env:
181-
GH_TOKEN: ${{ inputs.PAT }}
182-
run: |
183-
echo "::group::Build test container"
184-
docker build -f docker/Dockerfile.ci.dev --build-arg FROM_IMAGE_NAME="nvcr.io/nvidia/pytorch:25.06-py3" --target=main -t megatron-core .
185-
echo "::endgroup::"
186-
187-
- name: Start container
188-
shell: bash
189-
run: |
190-
echo "::group::Start test container"
191-
set -x
192-
193-
cmd=$(cat <<RUN_TEST_EOF
194-
#!/bin/bash
195-
docker container rm -f test_container_${{ github.run_id }} || true
196-
docker run \
197-
--rm \
198-
-d \
199-
--name test_container_${{ github.run_id }} \
200-
--runtime=nvidia --gpus all \
201-
--shm-size=64g \
202-
--ipc=host \
203-
-e NCCL_IB_DISABLE=1 \
204-
-e NCCL_P2P_LEVEL=NVL \
205-
--workdir /opt/megatron-lm/ \
206-
-v /home/runner/_work/TestData/megatron-lm/artifacts/text/data/:/workspace/data \
207-
--volume ${{ github.workspace }}/Megatron-LM:/opt/megatron-lm/ \
208-
$VOLUME_ARGS \
209-
megatron-core \
210-
bash -c "sleep $(( ${{ inputs.timeout }} * 60 + 60 ))"
211-
RUN_TEST_EOF
212-
)
213-
214-
echo "$cmd" | tee "retry_job.sh"
215-
bash retry_job.sh
216-
echo "::endgroup::"
217-
218147
- name: Set timeout
219-
shell: bash
148+
shell: bash -x -e -u -o pipefail {0}
220149
id: timeout_in_seconds
221150
run: |
222151
echo "::group::Set timeout"
223152
echo "main=$(( ${{ inputs.timeout }} * 60 ))" | tee -a "$GITHUB_OUTPUT"
224153
echo "::endgroup::"
225154
155+
- name: Pull container
156+
shell: bash -x -e -u -o pipefail {0}
157+
run: |
158+
echo "::group::Pull container"
159+
docker pull ${{ inputs.container-image }}
160+
echo "::endgroup::"
161+
226162
- name: Run main script
227-
uses: nick-fields/retry@v3
163+
shell: bash -x -e -u -o pipefail {0}
228164
id: run-main-script
229-
with:
230-
timeout_seconds: ${{ steps.timeout_in_seconds.outputs.main }}
231-
max_attempts: 3
232-
shell: bash
233-
retry_on: any
234-
command: /bin/bash job.sh
235-
on_retry_command: /bin/bash retry_job.sh
165+
run: |
166+
echo "::group::Run main script"
167+
EXIT_CODE=0
168+
/bin/bash job.sh || EXIT_CODE=$?
169+
echo "exit_code=$EXIT_CODE" | tee -a "$GITHUB_OUTPUT"
170+
echo "::endgroup::"
171+
exit $EXIT_CODE
236172
237173
- name: Check result
238174
id: check
239-
shell: bash
175+
shell: bash -x -e -u -o pipefail {0}
176+
if: always()
240177
env:
241178
IS_UNIT_TEST: ${{ inputs.is_unit_test == 'true' }}
242179
run: |
243180
echo "::group::Check result"
244181
245-
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/outputs/logs ./
246182
logs_report=logs-${{ inputs.test_case }}-${{ github.run_id }}-$(uuidgen)
247183
echo "logs_report=$logs_report" | sed 's/\//-/g' | sed 's/\*/-/g' | tee -a "$GITHUB_OUTPUT"
248184
249185
if [[ "$IS_UNIT_TEST" == "true" ]]; then
250-
docker exec test_container_${{ github.run_id }} /opt/venv/bin/coverage xml
251-
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/.coverage .coverage
252-
docker cp test_container_${{ github.run_id }}:/opt/megatron-lm/coverage.xml coverage.xml
253186
coverage_report=coverage-${{ inputs.is_unit_test == 'true' && 'unit-test' || 'e2e' }}-${{ github.run_id }}-$(uuidgen)
254187
else
255188
coverage_report=none
@@ -267,16 +200,18 @@ runs:
267200
if [[ "$IS_SUCCESS" == "false" ]]; then
268201
echo Test did not finish successfully.
269202
exit 1
270-
else
271-
docker exec -t test_container_${{ github.run_id }} /opt/venv/bin/coverage report -i
203+
fi
204+
205+
if [[ "$coverage_report" != "none" ]]; then
206+
uv run coverage report -i
272207
fi
273208
274209
exit $EXIT_CODE
275210
echo "::endgroup::"
276211
277212
- name: Upload coverage
278213
uses: actions/upload-artifact@v4
279-
if: ${{ steps.check.outputs.coverage_report != 'none' }}
214+
if: ${{ always() && steps.check.outputs.coverage_report != 'none' }}
280215
with:
281216
name: ${{ steps.check.outputs.coverage_report }}
282217
path: |
@@ -286,13 +221,8 @@ runs:
286221

287222
- name: Upload logs
288223
uses: actions/upload-artifact@v4
224+
if: always()
289225
with:
290226
name: ${{ steps.check.outputs.logs_report }}
291-
path: logs
227+
path: ${{ inputs.is_unit_test == 'true' && 'logs' || 'assets_dir' }}
292228
include-hidden-files: true
293-
294-
- name: Container shutdown
295-
if: always()
296-
shell: bash
297-
run: |
298-
docker container rm -f test_container_${{ github.run_id }} || true

0 commit comments

Comments
 (0)