Skip to content

Commit 7020e1f

Browse files
authored
[Dev] Add more tests for LayerwiseDistOpt with dist_ckpt (#2132)
Signed-off-by: Boxiang Wang <[email protected]>
1 parent 8427584 commit 7020e1f

File tree

5 files changed

+815
-19
lines changed

5 files changed

+815
-19
lines changed
Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
ENV_VARS:
  CUDA_DEVICE_MAX_CONNECTIONS: 1
  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
  NCCL_ALGO: Ring
  CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
  --num-layers: 12
  --hidden-size: 512
  --num-attention-heads: 8
  --log-params-norm: true
  --log-num-zeros-in-grad: true
  --log-validation-ppl-to-tensorboard: true
  --log-timers-to-tensorboard: true
  --tensorboard-dir: ${TENSORBOARD_PATH}
  --micro-batch-size: 4
  --global-batch-size: 32
  --seq-length: 1024
  --max-position-embeddings: 1024
  --disable-bias-linear: true
  --train-iters: 100
  --timing-log-level: 0
  --lr-decay-iters: 320000
  --save: ${CHECKPOINT_SAVE_PATH}
  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
  --split: 949,50,1
  --distributed-backend: nccl
  --lr: 0.00015
  --lr-decay-style: cosine
  --min-lr: 1.0e-5
  --weight-decay: 1e-2
  --clip-grad: 1.0
  --lr-warmup-fraction: .01
  --log-interval: 1
  --save-interval: 50
  --eval-interval: 1000
  --eval-iters: 10
  --transformer-impl: transformer_engine
  --tensor-model-parallel-size: 1
  --pipeline-model-parallel-size: 1
  --expert-model-parallel-size: 8
  --num-experts: 8
  --moe-token-dispatcher-type: allgather
  --moe-router-load-balancing-type: aux_loss
  --moe-router-topk: 2
  --moe-router-dtype: fp32
  --moe-ffn-hidden-size: 1024
  --moe-grouped-gemm: true
  --ckpt-fully-parallel-load: true
  --deterministic-mode: true
  --no-gradient-accumulation-fusion: true
  --attention-softmax-in-fp32: true
  --use-checkpoint-opt_param-scheduler: true
  --use-mcore-models: true
  --ckpt-format: torch_dist
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --no-bias-gelu-fusion: true
  --log-memory-to-tensorboard: true
  --optimizer: dist_muon
  --muon-momentum: 0.9
  --muon-extra-scale-factor: 0.2
  --muon-scale-mode: spectral
TEST_TYPE: ckpt-resume
Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
ENV_VARS:
  CUDA_DEVICE_MAX_CONNECTIONS: 1
  NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
  NCCL_ALGO: Ring
  CUBLAS_WORKSPACE_CONFIG: :4096:8
MODEL_ARGS:
  --num-layers: 12
  --hidden-size: 512
  --num-attention-heads: 8
  --log-params-norm: true
  --log-num-zeros-in-grad: true
  --log-validation-ppl-to-tensorboard: true
  --log-timers-to-tensorboard: true
  --tensorboard-dir: ${TENSORBOARD_PATH}
  --micro-batch-size: 4
  --global-batch-size: 32
  --seq-length: 1024
  --max-position-embeddings: 1024
  --disable-bias-linear: true
  --train-iters: 100
  --timing-log-level: 0
  --lr-decay-iters: 320000
  --save: ${CHECKPOINT_SAVE_PATH}
  --load: ${CHECKPOINT_LOAD_PATH}
  --data-path: ${DATA_PATH}/text/the_pile/shard00/my-gpt3_00_text_document
  --vocab-file: ${DATA_PATH}/text/the_pile/shard00/bpe/vocab.json
  --merge-file: ${DATA_PATH}/text/the_pile/shard00/bpe/merges.txt
  --split: 949,50,1
  --distributed-backend: nccl
  --lr: 0.00015
  --lr-decay-style: cosine
  --min-lr: 1.0e-5
  --weight-decay: 1e-2
  --clip-grad: 1.0
  --lr-warmup-fraction: .01
  --log-interval: 1
  --save-interval: 50
  --eval-interval: 1000
  --eval-iters: 10
  --transformer-impl: transformer_engine
  --tensor-model-parallel-size: 1
  --pipeline-model-parallel-size: 1
  --expert-model-parallel-size: 8
  --num-experts: 8
  --moe-token-dispatcher-type: allgather
  --moe-router-load-balancing-type: aux_loss
  --moe-router-topk: 2
  --moe-router-dtype: fp32
  --moe-ffn-hidden-size: 1024
  --moe-grouped-gemm: true
  --ckpt-fully-parallel-load: true
  --deterministic-mode: true
  --no-gradient-accumulation-fusion: true
  --attention-softmax-in-fp32: true
  --use-checkpoint-opt_param-scheduler: true
  --use-mcore-models: true
  --ckpt-format: torch_dist
  --ckpt-assume-constant-structure: true
  --data-cache-path: ${DATA_CACHE_PATH}
  --bf16: true
  --no-bias-gelu-fusion: true
  --log-memory-to-tensorboard: true
  --optimizer: muon
  --muon-momentum: 0.9
  --muon-extra-scale-factor: 0.2
  --muon-scale-mode: spectral
TEST_TYPE: ckpt-resume

tests/test_utils/recipes/moe.yaml

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,16 @@ products:
   - environment: [dev]
     scope: [mr, mr-github]
     platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_dist_muon]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-slim]
+        platforms: [dgx_h100]
+  - test_case: [gpt3_moe_mcore_te_ep8_resume_torch_dist_muon]
+    products:
+      - environment: [dev]
+        scope: [mr, mr-github, mr-slim]
+        platforms: [dgx_h100]
   - test_case: [gpt3_moe_mcore_te_tp2_pp2_ep4_etp1_no_mtp_no_a2a_ovlp_fine_grained_offloading]
     products:
       - environment: [dev]

0 commit comments

Comments
 (0)