Merge pull request #46 from NVIDIA/new_configs

Davood-M · web-flow · commit 5275af79afc1 · 2023-05-11T15:35:01.000-07:00
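Add changes for new configs: update GPT-3 training micro batch sizes, and the 43b max_steps, global batch size, and tensor/pipeline parallel sizes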
diff --git a/launcher_scripts/conf/training/gpt3/2b.yaml b/launcher_scripts/conf/training/gpt3/2b.yaml
@@ -47,7 +47,7 @@ exp_manager:
     buffer_size: 5
 
 model:
-  micro_batch_size: 4
+  micro_batch_size: 2
   global_batch_size: 512
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
diff --git a/launcher_scripts/conf/training/gpt3/43b.yaml b/launcher_scripts/conf/training/gpt3/43b.yaml
@@ -13,7 +13,7 @@ trainer:
   enable_checkpointing: False
   replace_sampler_ddp: False
   max_epochs: null
-  max_steps: 200000 # consumed_samples = global_step * global_batch_size
+  max_steps: 100000 # consumed_samples = global_step * global_batch_size
   max_time: "6:11:00:00" # days:hours:minutes:seconds
   log_every_n_steps: 10
   val_check_interval: 2000
@@ -48,9 +48,9 @@ exp_manager:
 
 model:
   micro_batch_size: 2
-  global_batch_size: 768
-  tensor_model_parallel_size: 4
-  pipeline_model_parallel_size: 4
+  global_batch_size: 1536
+  tensor_model_parallel_size: 8
+  pipeline_model_parallel_size: 2
   virtual_pipeline_model_parallel_size: null
   encoder_seq_length: 2048
   max_position_embeddings: 2048
diff --git a/launcher_scripts/conf/training/gpt3/843m.yaml b/launcher_scripts/conf/training/gpt3/843m.yaml
@@ -47,7 +47,7 @@ exp_manager:
     buffer_size: 5
 
 model:
-  micro_batch_size: 2
+  micro_batch_size: 4
   global_batch_size: 256
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
@@ -207,4 +207,4 @@ model:
       - .0333
       - ${data_dir}/my-gpt3_28_text_document
       - .0334
-      - ${data_dir}/my-gpt3_29_text_document
+      - ${data_dir}/my-gpt3_29_text_document
diff --git a/launcher_scripts/conf/training/gpt3/8b.yaml b/launcher_scripts/conf/training/gpt3/8b.yaml
@@ -47,7 +47,7 @@ exp_manager:
     buffer_size: 5
 
 model:
-  micro_batch_size: 4
+  micro_batch_size: 2
   global_batch_size: 512
   tensor_model_parallel_size: 2
   pipeline_model_parallel_size: 1