9 files changed +3
-8
lines changed Original file line number Diff line number Diff line change @@ -128,7 +128,6 @@ model:
128128 use_cpu_initialization : False # Init weights on the CPU (slow for large models)
129129 onnx_safe : False # Use work-arounds for known problems with Torch ONNX exporter.
130130 apex_transformer_log_level : 30 # Python logging level displays logs with severity greater than or equal to this
131- gc_interval : 100 # Interval of the host memory garbage collection
132131
133132 # Nsys profiling options
134133 nsys_profile :
Original file line number Diff line number Diff line change @@ -114,7 +114,6 @@ model:
114114 num_micro_batches_with_partial_activation_checkpoints : null
115115 activations_checkpoint_layers_per_pipeline : null
116116 sequence_parallel : false # does not support sequence parallel
117- gc_interval : 100 # Interval of the host memory garbage collection
118117
119118 # # Transformer Engine
120119 # fp8 training is currently not supported in the improved models
Original file line number Diff line number Diff line change @@ -127,7 +127,6 @@ model:
127127 use_cpu_initialization : False # Init weights on the CPU (slow for large models)
128128 onnx_safe : False # Use work-arounds for known problems with Torch ONNX exporter.
129129 apex_transformer_log_level : 30 # Python logging level displays logs with severity greater than or equal to this
130- gc_interval : 100 # Interval of the host memory garbage collection
131130
132131 # Nsys profiling options
133132 nsys_profile :
Original file line number Diff line number Diff line change @@ -114,7 +114,6 @@ model:
114114 num_micro_batches_with_partial_activation_checkpoints : null
115115 activations_checkpoint_layers_per_pipeline : null
116116 sequence_parallel : false # does not support sequence parallel
117- gc_interval : 100 # Interval of the host memory garbage collection
118117
119118 # # Transformer Engine
120119 # fp8 training is currently not supported in the improved models
Original file line number Diff line number Diff line change @@ -127,7 +127,6 @@ model:
127127 use_cpu_initialization : False # Init weights on the CPU (slow for large models)
128128 onnx_safe : False # Use work-arounds for known problems with Torch ONNX exporter.
129129 apex_transformer_log_level : 30 # Python logging level displays logs with severity greater than or equal to this
130- gc_interval : 100 # Interval of the host memory garbage collection
131130
132131 # Nsys profiling options
133132 nsys_profile :
Original file line number Diff line number Diff line change @@ -114,7 +114,6 @@ model:
114114 num_micro_batches_with_partial_activation_checkpoints : null
115115 activations_checkpoint_layers_per_pipeline : null
116116 sequence_parallel : false # does not support sequence parallel
117- gc_interval : 100 # Interval of the host memory garbage collection
118117
119118 # # Transformer Engine
120119 # fp8 training is currently not supported in the improved models
Original file line number Diff line number Diff line change @@ -127,7 +127,6 @@ model:
127127 use_cpu_initialization : False # Init weights on the CPU (slow for large models)
128128 onnx_safe : False # Use work-arounds for known problems with Torch ONNX exporter.
129129 apex_transformer_log_level : 30 # Python logging level displays logs with severity greater than or equal to this
130- gc_interval : 100 # Interval of the host memory garbage collection
131130
132131 # Nsys profiling options
133132 nsys_profile :
Original file line number Diff line number Diff line change @@ -114,7 +114,6 @@ model:
114114 num_micro_batches_with_partial_activation_checkpoints : null
115115 activations_checkpoint_layers_per_pipeline : null
116116 sequence_parallel : false # does not support sequence parallel
117- gc_interval : 100 # Interval of the host memory garbage collection
118117
119118 # # Transformer Engine
120119 # fp8 training is currently not supported in the improved models
Original file line number Diff line number Diff line change @@ -523,6 +523,9 @@ def _make_hydra_override(self) -> List:
523523 if self .stage_cfg .model .get ("ub_tp_comm_overlap" , False ):
524524 get_ub_cfg_file_command = self ._get_ub_cfg_file ()
525525 hydra_override += [f"+model.ub_tp_comm_overlap_cfg=\$({ get_ub_cfg_file_command } )" ]
526+ if self .stage_cfg .model .get ("gc_interval" , 0 ) > 1 :
527+ gc_interval = min (self .stage_cfg .model .get ("gc_interval" ), self .cfg .training .trainer .get ("val_check_interval" ))
528+ hydra_override += [f"model.gc_interval={ gc_interval } " ]
526529 return hydra_override
527530
528531 def _get_nemo_code_path (self , model_type : str ) -> Path :
You can’t perform that action at this time.
0 commit comments