Skip to content

Commit 658931e

Browse files
ci: Create weekly dev branch (NVIDIA#2223)
Signed-off-by: oliver könig <[email protected]>
Signed-off-by: GitHub Actions <github-actions[bot]@users.noreply.github.com>
Co-authored-by: GitHub Actions <github-actions[bot]@users.noreply.github.com>
1 parent 693587d commit 658931e

File tree

8 files changed

+20249
-7
lines changed

8 files changed

+20249
-7
lines changed

.gitlab/stages/00.pre.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -68,6 +68,7 @@ pre:create_ci_branches_dev:
6868
- branch: ci-dev-rebuild-mcore-nemo-image
6969
- branch: ci-dev-mr
7070
- branch: ci-dev-nightly
71+
- branch: ci-dev-weekly
7172
- branch: ci-dev-upgrade-dependencies
7273
tags:
7374
- arch/amd64

.gitlab/stages/04.functional-tests.yml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ functional:configure:
5353
- |
5454
RECORD_CHECKPOINTS=$([[ "$CI_MERGE_REQUEST_LABELS" == *"Record checkpoints"* || "$FUNCTIONAL_TEST_RECORD_CHECKPOINTS" == "yes" ]] && echo "true" || echo "false")
5555
- |
56-
if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "pre-release" ]]; then
56+
if [[ "$FUNCTIONAL_TEST_SCOPE" == "release" || "$FUNCTIONAL_TEST_SCOPE" == "weekly" ]]; then
5757
FUNCTIONAL_TEST_NAME=$(eval echo $FUNCTIONAL_TEST_NAME)
5858
RELEASE_ARGS=(
5959
"--run-name"

tests/functional_tests/test_cases/gpt/gpt3_15b_8t_release_sm/model_config.yaml

Lines changed: 5 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ MODEL_ARGS:
2323
--micro-batch-size: 4
2424
--rampup-batch-size: "[384 384 97656250]"
2525
--global-batch-size: 1152
26-
--train-samples: 4882812
26+
--train-samples: 19531250
2727
--manual-gc: true
2828
# Transformer Engine args
2929
--transformer-impl: transformer_engine
@@ -68,9 +68,10 @@ MODEL_ARGS:
6868
--eval-iters: 32
6969
--eval-interval: 2000
7070
# Add checkpointing args
71-
--load: ${CHECKPOINT_LOAD_PATH}
7271
--save: ${CHECKPOINT_SAVE_PATH}
73-
--save-interval: 1000
72+
--load: ${CHECKPOINT_LOAD_PATH}
73+
--save-interval: 5000
74+
--save-retain-interval: 10000
7475
# Add initialization args
7576
--init-method-std: 0.0134
7677
# Add logging args
@@ -86,7 +87,7 @@ MODEL_ARGS:
8687
--wandb-exp-name: ${WANDB_EXPERIMENT}
8788
# Add mixed precision args
8889
--bf16: true
89-
--exit-interval: 13000
90+
--exit-interval: 10200
9091
--wandb-save-dir: ${WANDB_SAVE_PATH}
9192
METRICS:
9293
- "iteration-time"

tests/functional_tests/test_cases/gpt/gpt3_weekly_dgx_b200_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/model_config.yaml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -41,7 +41,7 @@ MODEL_ARGS:
4141
--pipeline-model-parallel-size: 2
4242
--sequence-parallel: true
4343
--tp-comm-overlap: true
44-
--tp-comm-overlap-cfg: tests/functional_tests/test_cases/gpt/gpt3_345m_weekly_dgx_b200_1N8G_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml
44+
--tp-comm-overlap-cfg: tests/functional_tests/test_cases/gpt/gpt3_mcore_tp2_pp2_current_scaling_native_fp8_tp_pp_sp_tp_overlap/tp_comm_overlap_cfg.yaml
4545
--deterministic-mode: true
4646
--no-gradient-accumulation-fusion: true
4747
--fp8-format: hybrid

0 commit comments

Comments (0)