Skip to content

Commit 5cbe81d

Browse files
authored
Merge pull request #83 from NVIDIA/slym/gc_interval
gc_interval fix for the case where the validation interval is shorter than gc_interval
2 parents e62f9e5 + 17b7971 commit 5cbe81d

File tree

9 files changed

+3
-8
lines changed

9 files changed

+3
-8
lines changed

launcher_scripts/conf/training/gpt3/126m.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,6 @@ model:
128128
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
129129
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
130130
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
131-
gc_interval: 100 # Interval of the host memory garbage collection
132131

133132
# Nsys profiling options
134133
nsys_profile:

launcher_scripts/conf/training/gpt3/1b_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/conf/training/gpt3/20b.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ model:
127127
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
128128
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
129129
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
130-
gc_interval: 100 # Interval of the host memory garbage collection
131130

132131
# Nsys profiling options
133132
nsys_profile:

launcher_scripts/conf/training/gpt3/400m_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/conf/training/gpt3/40b.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ model:
127127
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
128128
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
129129
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
130-
gc_interval: 100 # Interval of the host memory garbage collection
131130

132131
# Nsys profiling options
133132
nsys_profile:

launcher_scripts/conf/training/gpt3/40b_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/conf/training/gpt3/5b.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -127,7 +127,6 @@ model:
127127
use_cpu_initialization: False # Init weights on the CPU (slow for large models)
128128
onnx_safe: False # Use work-arounds for known problems with Torch ONNX exporter.
129129
apex_transformer_log_level: 30 # Python logging level displays logs with severity greater than or equal to this
130-
gc_interval: 100 # Interval of the host memory garbage collection
131130

132131
# Nsys profiling options
133132
nsys_profile:

launcher_scripts/conf/training/gpt3/7b_improved.yaml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -114,7 +114,6 @@ model:
114114
num_micro_batches_with_partial_activation_checkpoints: null
115115
activations_checkpoint_layers_per_pipeline: null
116116
sequence_parallel: false # does not support sequence parallel
117-
gc_interval: 100 # Interval of the host memory garbage collection
118117

119118
## Transformer Engine
120119
# fp8 training is currently not supported in the improved models

launcher_scripts/nemo_launcher/core/stages.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -523,6 +523,9 @@ def _make_hydra_override(self) -> List:
523523
if self.stage_cfg.model.get("ub_tp_comm_overlap", False):
524524
get_ub_cfg_file_command = self._get_ub_cfg_file()
525525
hydra_override += [f"+model.ub_tp_comm_overlap_cfg=\$({get_ub_cfg_file_command})"]
526+
if self.stage_cfg.model.get("gc_interval", 0) > 1:
527+
gc_interval = min(self.stage_cfg.model.get("gc_interval"), self.cfg.training.trainer.get("val_check_interval"))
528+
hydra_override += [f"model.gc_interval={gc_interval}"]
526529
return hydra_override
527530

528531
def _get_nemo_code_path(self, model_type: str) -> Path:

0 commit comments

Comments (0)