Skip to content

Commit 5cbe81d

Browse files
authored
Merge pull request #83 from NVIDIA/slym/gc_interval
gc_interval fix for the case where the validation interval is shorter than gc_interval
2 parents e62f9e5 + 17b7971 commit 5cbe81d

File tree

9 files changed

+3
-8
lines changed

9 files changed

+3
-8
lines changed

launcher_scripts/conf/training/gpt3/126m.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ model:
128128
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
129129
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
130130
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
131-
gc_interval: 100 # Interval of the host memory garbage collection
132131

133132
# Nsys profiling options
134133
nsys_profile:

launcher_scripts/conf/training/gpt3/1b_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/conf/training/gpt3/20b.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ model:
127127
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
128128
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
129129
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
130-
gc_interval: 100 # Interval of the host memory garbage collection
131130

132131
# Nsys profiling options
133132
nsys_profile:

launcher_scripts/conf/training/gpt3/400m_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/conf/training/gpt3/40b.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ model:
127127
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
128128
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
129129
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
130-
gc_interval: 100 # Interval of the host memory garbage collection
131130

132131
# Nsys profiling options
133132
nsys_profile:

launcher_scripts/conf/training/gpt3/40b_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/conf/training/gpt3/5b.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ model:
127127
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
128128
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
129129
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
130-
gc_interval: 100 # Interval of the host memory garbage collection
131130

132131
# Nsys profiling options
133132
nsys_profile:

launcher_scripts/conf/training/gpt3/7b_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/nemo_launcher/core/stages.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,9 @@ def _make_hydra_override(self) -> List:
523523
if self.stage_cfg.model.get("ub_tp_comm_overlap", False):
524524
get_ub_cfg_file_command = self._get_ub_cfg_file()
525525
hydra_override += [f"+model.ub_tp_comm_overlap_cfg=\$({get_ub_cfg_file_command})"]
526+
if self.stage_cfg.model.get("gc_interval", 0) > 1:
527+
gc_interval = min(self.stage_cfg.model.get("gc_interval"), self.cfg.training.trainer.get("val_check_interval"))
528+
hydra_override += [f"model.gc_interval={gc_interval}"]
526529
return hydra_override
527530

528531
def _get_nemo_code_path(self, model_type: str) -> Path:

0 commit comments

Comments (0)