launcher_scripts/conf/training/nemotron/nemotron_340b.yaml (+13, -5)
@@ -58,8 +58,8 @@ model:
   rampup_batch_size: null
   context_parallel_size: 1
   tensor_model_parallel_size: 8
-  pipeline_model_parallel_size: 12
-  virtual_pipeline_model_parallel_size: 8
+  pipeline_model_parallel_size: 8
+  virtual_pipeline_model_parallel_size: 12
   encoder_seq_length: 4096
   max_position_embeddings: 4096
   num_layers: 96
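
This hunk swaps the pipeline split: the old layout used 12 physical pipeline stages with 8 interleaved virtual chunks each, the new one uses 8 stages with 12 chunks each. With num_layers: 96, both layouts work out to one layer per virtual chunk (96 / 12 / 8 = 96 / 8 / 12 = 1), and fewer physical stages with more interleaving generally shrinks the pipeline bubble, which is the usual motivation for this kind of swap. Below is a minimal sketch of the Megatron-style divisibility constraint these two values must satisfy; the helper name is an assumption for illustration, not the launcher's actual validation code:

    # Hypothetical helper (name assumed): validates an interleaved pipeline layout.
    def layers_per_virtual_chunk(num_layers: int, pp: int, vpp: int) -> int:
        # Layers must split evenly across the pp physical pipeline stages...
        assert num_layers % pp == 0, "num_layers not divisible by pipeline_model_parallel_size"
        per_stage = num_layers // pp
        # ...and each stage's share must split evenly into vpp interleaved chunks.
        assert per_stage % vpp == 0, "stage layers not divisible by virtual_pipeline_model_parallel_size"
        return per_stage // vpp

    print(layers_per_virtual_chunk(96, 12, 8))   # old config -> 1
    print(layers_per_virtual_chunk(96, 8, 12))   # new config -> 1
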
@@ -131,9 +131,17 @@ model:
   fsdp_sharding_strategy: 'full' # Method to shard model states. Available options are 'full', 'hybrid', and 'grad'.
   fsdp_grad_reduce_dtype: 32 # Gradient reduction data type.
   fsdp_sharded_checkpoint: False # Store and load FSDP sharded checkpoint.
+
+  defer_embedding_wgrad_compute: True
+  wgrad_deferral_limit: 22
+  cross_entropy_loss_fusion: True
+  enable_vboost: True
+  ub_tp_comm_overlap: True
+  apply_rope_fusion: True
+  deterministic_mode: False
+  overlap_p2p_comm: True # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
+  batch_p2p_comm: False # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
 
-  overlap_p2p_comm: False # Overlap p2p communication with computes. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
-  batch_p2p_comm: True # Batch consecutive inter-peer send/recv operations. This argument is valid only when `virtual_pipeline_model_parallel_size` is larger than 1
   num_query_groups: 8 # Number of query groups for group query attention. If None, normal attention is used.
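
The inline comments gate overlap_p2p_comm and batch_p2p_comm on interleaved pipelining being active, which the swapped values above now enable. Below is a small sketch of that rule as a config check; the function name and warning wording are assumptions for illustration, not code from the launcher:

    import warnings

    # Hypothetical check (names assumed): the two p2p flags only take effect
    # when virtual_pipeline_model_parallel_size is larger than 1.
    def warn_on_inert_p2p_flags(cfg: dict) -> None:
        vpp = cfg.get("virtual_pipeline_model_parallel_size") or 1
        if vpp <= 1:
            for flag in ("overlap_p2p_comm", "batch_p2p_comm"):
                if flag in cfg:
                    warnings.warn(f"{flag} has no effect when virtual_pipeline_model_parallel_size <= 1")

    # The updated config sets VPP to 12, so both flags are active here.
    warn_on_inert_p2p_flags({
        "virtual_pipeline_model_parallel_size": 12,
        "overlap_p2p_comm": True,
        "batch_p2p_comm": False,
    })  # no warning emitted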