Skip to content

Commit 5730fac

Browse files
authored
Merge pull request #450 from NVIDIA/zhenghax-recipe_updae_improve_mfu
Update nemotron_340b.yaml
2 parents b09c6a6 + 630e654 commit 5730fac

File tree

1 file changed

+13
-5
lines changed

1 file changed

+13
-5
lines changed

launcher_scripts/conf/training/nemotron/nemotron_340b.yaml

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ model:
5858
rampup_batch_size: null
5959
context_parallel_size: 1
6060
tensor_model_parallel_size: 8
61-
pipeline_model_parallel_size: 12
62-
virtual_pipeline_model_parallel_size: 8
61+
pipeline_model_parallel_size: 8
62+
virtual_pipeline_model_parallel_size: 12
6363
encoder_seq_length: 4096
6464
max_position_embeddings: 4096
6565
num_layers: 96
@@ -131,9 +131,17 @@ model:
131131
fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'.
132132
fsdp_grad_reduce_dtype: 32 # Gradient reduction data type.
133133
fsdp_sharded_checkpoint: False # Store and load FSDP sharded checkpoint.
134+
135+
defer_embedding_wgrad_compute: True
136+
wgrad_deferral_limit: 22
137+
cross_entropy_loss_fusion: True
138+
enable_vboost: True
139+
ub_tp_comm_overlap: True
140+
apply_rope_fusion: True
141+
deterministic_mode: False
142+
overlap_p2p_comm: True # Overlap p2p communication with computation. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
143+
batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
134144

135-
overlap_p2p_comm: False # Overlap p2p communication with computation. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
136-
batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
137145
num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used.
138146

139147
## Network
@@ -188,4 +196,4 @@ model:
188196
- .0333
189197
- ${data_dir}/my-nemotron_00_text_document
190198
- .0333
191-
- ${data_dir}/my-nemotron_00_text_document
199+
- ${data_dir}/my-nemotron_00_text_document

0 commit comments

Comments
 (0)