From 78e8098f138fa0f72ca11cb6953aa4e412a6d16a Mon Sep 17 00:00:00 2001
From: AGupta41
Date: Fri, 13 Jun 2025 19:11:51 +0000
Subject: [PATCH 1/4] Added a function to register MuP arguments

---
 megatron/arguments.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)

diff --git a/megatron/arguments.py b/megatron/arguments.py
index 9d9d52284d7..b240e6dcd9e 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -46,6 +46,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_transformer_engine_args(parser)
     parser = _add_retro_args(parser)
     parser = _add_profiler_args(parser)
+    parser = _add_mup_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -500,6 +501,43 @@ def core_transformer_config_from_args(args):
 
     return TransformerConfig(**kw_args)
 
+def _add_mup_args(parser):
+    group = parser.add_argument_group(title='MuP')
+
+    group.add_argument('--enable-mup', type=bool, default=False,
+                       help='Set True to use MuP', dest='enable-mup')
+
+    group.add_argument('--mup-coord-check', type=bool, default=False,
+                       help='Perform coordinate check for MuP', dest='mup-coord-check')
+
+    group.add_argument('--mup-input-weights-scale', type=float, default=1.0,
+                       help='Scalar to multiply initial weights', dest='mup-input-weights-scale')
+
+    group.add_argument('--mup-hidden-weights-scale', type=float, default=1.0,
+                       help='Scalar to multiply hidden weights', dest='mup-hidden-weights-scale')
+
+    group.add_argument('--mup-output-weights-scale', type=float, default=1.0,
+                       help='Scalar to multiply output weights', dest='mup-output-weights-scale')
+
+    group.add_argument('--mup-input-lr-scale', type=float, default=1.0,
+                       help='To scale learning rate for input weights', dest='mup-input-lr-scale')
+
+    group.add_argument('--mup-hidden-lr-scale', type=float, default=1.0,
+                       help='To scale learning rate for hidden weights', dest='mup-hidden-lr-scale')
+
+    group.add_argument('--mup-output-lr-scale', type=float, default=1.0,
+                       help='To scale learning rate for output weights', dest='mup-output-lr-scale')
+
+    return parser
+
+
 def _add_transformer_engine_args(parser):
     group = parser.add_argument_group(title='Transformer-Engine')

From 52603273ed9f25db5298d51a08bac8d66f21bca5 Mon Sep 17 00:00:00 2001
From: AGupta41
Date: Thu, 26 Jun 2025 14:22:59 +0000
Subject: [PATCH 2/4] Temporary update: Added MuP and depth-scaling variables
 as arguments and created new optimizer param groups

---
 ALCF/helpers.sh                             | 25 +++++++++
 megatron/arguments.py                       | 45 ++++++++++++----
 .../core/transformer/transformer_config.py  | 22 ++++++++
 megatron/model/language_model.py            |  8 ++-
 megatron/model/utils.py                     |  2 +-
 megatron/optimizer/__init__.py              | 54 ++++++++++++++++---
 megatron/optimizer_param_scheduler.py       | 11 +++-
 7 files changed, 147 insertions(+), 20 deletions(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index f42f331af59..e6c50af3401 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -222,6 +222,28 @@ setup_run_cmd() {
     if [[ -z "${OVERRIDE_CKPT_OPT_PARAM:-}" ]]; then
         train_args+=("--use-checkpoint-opt_param-scheduler")
     fi
+
+    # Add MuP to the model
+    export MUP_BASE_WIDTH=${MUP_BASE_WIDTH:-256}
+    export MUP_MUL=$(( $HIDDEN / $MUP_BASE_WIDTH ))
+    mup_flags+=(
+        "--enable-mup"
+        "--mup-coord-check=True"
+        "--mup-hidden-weights-scale=${MUP_MUL}"
+        "--mup-hidden-lr-scale=${MUP_MUL}"
+    )
+
+    # Add depth scaling to the model
+    export DEPTH_BASE=${DEPTH_BASE:-2}
+    export DEPTH_MUL=$(( $NLAYERS / $DEPTH_BASE ))
+    depth_scaling_flags+=(
+        "--enable-depth-scale"
+        "--depth-base=${DEPTH_BASE}"
+        "--depth-multiplier=${DEPTH_MUL}"
+        "--depth-alpha=0.5")
+
     # "--init-method-std ${INIT_METHOD_STD:-0.0006}"
     # "--shuffle-sample"
     train_args+=(
@@ -271,6 +293,9 @@ setup_run_cmd() {
         "--num-attention-heads=${HEADS}"
         "--data-cache-path=${data_cache_path}"
         "--data-file-list=${DATA_FILE_LIST:-${dfl_fallback}}"
+        # add MuP parameters
+        "${mup_flags[@]}"
+        "${depth_scaling_flags[@]}"
     )
     # "--adam-eps ${ADAM_EPS:-0.00001}"
     cache_dir="${PBS_O_WORKDIR}/.cache/"

diff --git a/megatron/arguments.py b/megatron/arguments.py
index b240e6dcd9e..63abf37cf0a 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -47,6 +47,7 @@ def parse_args(extra_args_provider=None, ignore_unknown_args=False):
     parser = _add_retro_args(parser)
     parser = _add_profiler_args(parser)
     parser = _add_mup_args(parser)
+    parser = _add_depth_scaling_args(parser)
 
     # Custom arguments.
     if extra_args_provider is not None:
@@ -501,6 +502,64 @@ def core_transformer_config_from_args(args):
 
     return TransformerConfig(**kw_args)
 
+def _add_depth_scaling_args(parser):
+
+    group = parser.add_argument_group(title='Depth_Scaling')
+
+    group.add_argument('--enable-depth-scale', action='store_true',
+                       help='Include in cmd to implement parameterization for model depth scaling', dest='enable_depth_scale')
+
+    #group.add_argument('--depth_scaling_enabled', type=bool, default=False,
+    #                   help='Include in cmd to implement parameterization for model depth scaling', dest='depth_scaling_enabled')
+
+    group.add_argument('--depth-base', type=int, default=1,
+                       help='Specify number of layers in base model', dest='depth_base')
+
+    group.add_argument('--depth-multiplier', type=float, default=1.0,
+                       help='Number of layers / Base number of layers', dest='depth_multiplier')
+
+    group.add_argument('--depth-alpha', type=float, default=0.5,
+                       help='Value of alpha used in depth scaling', dest='depth_alpha')
+
+    return parser
+
 def _add_mup_args(parser):
     group = parser.add_argument_group(title='MuP')
 
-    group.add_argument('--enable-mup', type=bool, default=False,
-                       help='Set True to use MuP', dest='enable-mup')
+    group.add_argument('--enable-mup', action='store_true',
+                       help='Include in cmd to implement MuP', dest='enable_mup')
+    #group.add_argument('--enable-mup', type=bool, default=False,
+    #                   help='Set True to use MuP', dest='enable-mup')
 
     group.add_argument('--mup-coord-check', type=bool, default=False,
-                       help='Perform coordinate check for MuP', dest='mup-coord-check')
+                       help='Perform coordinate check for MuP', dest='mup_coord_check')
 
     group.add_argument('--mup-input-weights-scale', type=float, default=1.0,
-                       help='Scalar to multiply initial weights', dest='mup-input-weights-scale')
+                       help='Scalar to multiply initial weights', dest='mup_input_weights_scale')
 
     group.add_argument('--mup-hidden-weights-scale', type=float, default=1.0,
-                       help='Scalar to multiply hidden weights', dest='mup-hidden-weights-scale')
+                       help='Scalar to multiply hidden weights', dest='mup_hidden_weights_scale')
 
     group.add_argument('--mup-output-weights-scale', type=float, default=1.0,
-                       help='Scalar to multiply output weights', dest='mup-output-weights-scale')
+                       help='Scalar to multiply output weights', dest='mup_output_weights_scale')
 
     group.add_argument('--mup-input-lr-scale', type=float, default=1.0,
-                       help='To scale learning rate for input weights', dest='mup-input-lr-scale')
+                       help='To scale learning rate for input weights', dest='mup_input_lr_scale')
 
     group.add_argument('--mup-hidden-lr-scale', type=float, default=1.0,
-                       help='To scale learning rate for hidden weights', dest='mup-hidden-lr-scale')
+                       help='To scale learning rate for hidden weights', dest='mup_hidden_lr_scale')
 
     group.add_argument('--mup-output-lr-scale', type=float, default=1.0,
-                       help='To scale learning rate for output weights', dest='mup-output-lr-scale')
+                       help='To scale learning rate for output weights', dest='mup_output_lr_scale')
 
     return parser

diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index de21046f026..9cee40c743e 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -143,6 +143,19 @@ class TransformerConfig(ModelParallelConfig):
     recompute_method: str = None
     recompute_num_layers: int = None
     distribute_saved_activations: bool = None
+
+    ### Begin MuP Code ###
+    # MuP enabled
+    enable_mup: bool = True
+    mup_hidden_lr_scale: float = 1.0
+    mup_hidden_weights_scale: float = 1.0
+
+    # Depth scaling enabled
+    enable_depth_scale: bool = True
+    depth_multiplier: float = 1.0
+    depth_alpha: float = 1.0
+
+    ### End MuP Code ###
 
     def __post_init__(self):
         """ Python dataclass method that is used to modify attributes after initialization.
@@ -212,3 +225,12 @@ class TransformerConfig(ModelParallelConfig):
         if self.output_layer_init_method is None:
             self.output_layer_init_method = scaled_init_method_normal(self.init_method_std,
                                                                       self.num_layers)
+
+        ### Begin MuP Code ###
+        # Check if mup-enable flag is included in args
+        if self.enable_mup is None:
+            self.enable_mup = True
+
+        if self.enable_depth_scale is None:
+            self.enable_depth_scale = True
+        ### End MuP Code ###

diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py
index eebf8744ca5..dcb77409ba2 100644
--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
@@ -147,7 +147,8 @@ def __init__(self,
         super(Embedding, self).__init__()
 
         self.hidden_size = hidden_size
-        self.init_method = config.init_method
+
+
         self.num_tokentypes = num_tokentypes
 
         args = get_args()
@@ -155,8 +156,10 @@ def __init__(self,
         # Word embeddings (parallel).
         self.embedding_weights_in_fp32 = embedding_weights_in_fp32
         self.params_dtype = args.params_dtype
+
         self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-            vocab_size, self.hidden_size, config=config, init_method=config.init_method)
+            vocab_size, self.hidden_size, config=config, init_method=config.init_method)
+
         self._word_embeddings_key = 'word_embeddings'
 
         # Position embedding (serial).
@@ -505,6 +508,7 @@ def __init__(self,
             # embedding tying that also does not have a bias.
             bias=False
         )
+        self._output_layer_key = 'output_layer'
 
     def set_input_tensor(self, input_tensor):

diff --git a/megatron/model/utils.py b/megatron/model/utils.py
index 6c94921c958..a6e96f278b1 100644
--- a/megatron/model/utils.py
+++ b/megatron/model/utils.py
@@ -12,12 +12,12 @@
 
 def init_method_normal(sigma):
     """Init method based on N(0, sigma)."""
+    ### Begin MuP Comment --- Use this function with the correctly scaled sigma value --- ### End MuP Comment
     def init_(tensor):
         return torch.nn.init.normal_(tensor, mean=0.0, std=sigma)
 
     return init_
-
 def scaled_init_method_normal(sigma, num_layers):
     """Init method based on N(0, sigma/sqrt(2*num_layers)."""
     std = sigma / math.sqrt(2.0 * num_layers)

diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index f4ee7cc776d..9434f1064dd 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -59,6 +59,15 @@ def get_param_groups(
     #     }
     # ]
     """
+    ### Begin MuP Code ###
+
+    args = get_args()
+    assert args is not None
+
+    no_wd_no_scale_lr_depth_lr = []
+    wd_mup_wd_no_scale_lr_depth_mup_lr = []
+    ### End MuP Code ###
+
     wd_no_scale_lr = []
     wd_scale_lr = []
     no_wd_no_scale_lr = []
@@ -66,7 +75,14 @@ def get_param_groups(
     galore_params = []
     target_modules_list = ["attn", "mlp"]
     for module in modules:
+        print("-----------------AG DEBUG MODULE--------------------")
+        print(module)
+        print("-----------------AG DEBUG MODULE--------------------")
        for name, param in module.named_parameters():
+            print("-----------------AG DEBUG PAR NAME--------------------")
+            print(name)
+            print("-----------------AG DEBUG PAR NAME--------------------")
+
            if not param.requires_grad:
                continue
 
@@ -82,23 +98,49 @@ def get_param_groups(
                 scale_lr = False
 
             if not no_wd and not scale_lr:
-                wd_no_scale_lr.append(param)
+                ### Begin MuP Code ###
+                if args.enable_mup and args.enable_depth_scale and ( 'self_attention' in name or 'mlp.dense' in name):
+                    print("Adding mup and depth scaling lr to -" )
+                    print(name)
+                    wd_mup_wd_no_scale_lr_depth_mup_lr.append(param)
+
+                else:
+                    wd_no_scale_lr.append(param)
+                ### End MuP Code ###
             elif not no_wd and scale_lr:
                 wd_scale_lr.append(param)
             elif no_wd and not scale_lr:
-                no_wd_no_scale_lr.append(param)
+                ### Begin MuP Code ###
+                if args.enable_depth_scale and ('input_layernorm' in name or 'post_attention_layernorm' in name):
+                    print("Adding depth scaling lr to -" )
+                    print(name)
+                    # Add depth scaling
+                    no_wd_no_scale_lr_depth_lr.append(param)
+                elif args.enable_depth_scale and ('self_attention' in name or 'mlp.dense' in name):
+                    print("Adding depth scaling lr to -" )
+                    print(name)
+                    # Add depth scaling
+                    no_wd_no_scale_lr_depth_lr.append(param)
+                else:
+                    no_wd_no_scale_lr.append(param)
+                ### End MuP Code ###
             else:
                 no_wd_scale_lr.append(param)
 
     param_groups = []
+    ### Begin MuP Comment --- Adding _mup_lr_mult and _depth_lr_mult by default to all parameter groups --- End MuP Comment ###
     if len(wd_no_scale_lr):
-        param_groups.append({'name': 'wd_no_scale_lr', 'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0})
+        param_groups.append({'name': 'wd_no_scale_lr', 'params': wd_no_scale_lr, 'wd_mult': 1.0, 'lr_mult': 1.0, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
     if len(wd_scale_lr):
-        param_groups.append({'name': 'wd_scale_lr', 'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult})
+        param_groups.append({'name': 'wd_scale_lr', 'params': wd_scale_lr, 'wd_mult': 1.0, 'lr_mult': lr_mult, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
     if len(no_wd_no_scale_lr):
-        param_groups.append({'name': 'no_wd_no_scale_lr', 'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0})
+        param_groups.append({'name': 'no_wd_no_scale_lr', 'params': no_wd_no_scale_lr, 'wd_mult': 0.0, 'lr_mult': 1.0, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
     if len(no_wd_scale_lr):
-        param_groups.append({'name': 'no_wd_scale_lr', 'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult})
+        param_groups.append({'name': 'no_wd_scale_lr', 'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
+    if len(no_wd_no_scale_lr_depth_lr):
+        param_groups.append({'name': 'no_wd_no_scale_lr_depth_lr', 'params': no_wd_no_scale_lr_depth_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':1.0, '_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1), '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
+    if len(wd_mup_wd_no_scale_lr_depth_mup_lr):
+        param_groups.append({'name': 'wd_mup_wd_no_scale_lr_depth_mup_lr', 'params': wd_mup_wd_no_scale_lr_depth_mup_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':args.mup_hidden_lr_scale ** (-1), '_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1), '_mup_wd_mult':args.mup_hidden_weights_scale, '_depth_wd_mult':1.0})
 
     return param_groups

diff --git a/megatron/optimizer_param_scheduler.py b/megatron/optimizer_param_scheduler.py
index 5933ffa048a..e75a6cee1fb 100644
--- a/megatron/optimizer_param_scheduler.py
+++ b/megatron/optimizer_param_scheduler.py
@@ -237,8 +237,15 @@ def step(self, increment, token_num=None):
         new_lr = self.get_lr()
         new_wd = self.get_wd()
         for group in self.optimizer.param_groups:
-            group['lr'] = new_lr * group.get('lr_mult', 1.0)
-            group['weight_decay'] = new_wd * group.get('wd_mult', 1.0)
+            ### Begin MuP Code ###
+            print("-----------AG DEBUG-------------")
+            print(group)
+            print("-----------AG DEBUG-------------")
+
+            group['lr'] = new_lr * group.get('lr_mult', 1.0) * group.get('_mup_lr_mult', 1.0) * group.get('_depth_lr_mult', 1.0)
+            group['weight_decay'] = new_wd * group.get('wd_mult', 1.0) * group.get('_mup_wd_mult', 1.0) * group.get('_depth_wd_mult', 1.0)
+
+            ### End MuP Code ###
 
     def state_dict(self):

From 755f0c094081f5c2cedaecd4161461127f4202e4 Mon Sep 17 00:00:00 2001
From: AGupta41
Date: Mon, 14 Jul 2025 19:25:23 +0000
Subject: [PATCH 3/4] Added code for MuP coordinate-check

---
 ALCF/helpers.sh                             |  4 +-
 .../core/transformer/transformer_config.py  | 10 +++--
 megatron/model/gpt_model.py                 |  2 +-
 megatron/model/language_model.py            | 24 +++++++---
 megatron/model/transformer.py               | 44 ++++++++++++-----
 megatron/optimizer/__init__.py              | 34 +++++++-----
 megatron/optimizer_param_scheduler.py       |  6 +--
 megatron/training.py                        | 41 ++++++++++++++++-
 megatron/training_log.py                    |  8 +++-
 megatron/utils.py                           | 39 ++++++++++++++
 pretrain_gpt_alcf.py                        |  1 +
 11 files changed, 173 insertions(+), 40 deletions(-)

diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index e6c50af3401..460820c1c84 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -227,7 +227,7 @@ setup_run_cmd() {
     export MUP_BASE_WIDTH=${MUP_BASE_WIDTH:-256}
     export MUP_MUL=$(( $HIDDEN / $MUP_BASE_WIDTH ))
     mup_flags+=(
-        "--enable-mup"
+        #"--enable-mup"
         "--mup-coord-check=True"
         "--mup-hidden-weights-scale=${MUP_MUL}"
         "--mup-hidden-lr-scale=${MUP_MUL}"
     )
@@ -238,7 +238,7 @@ setup_run_cmd() {
     export DEPTH_BASE=${DEPTH_BASE:-2}
     export DEPTH_MUL=$(( $NLAYERS / $DEPTH_BASE ))
     depth_scaling_flags+=(
-        "--enable-depth-scale"
+        #"--enable-depth-scale"
         "--depth-base=${DEPTH_BASE}"
         "--depth-multiplier=${DEPTH_MUL}"
         "--depth-alpha=0.5")

diff --git a/megatron/core/transformer/transformer_config.py b/megatron/core/transformer/transformer_config.py
index 9cee40c743e..b71162d4ae1 100644
--- a/megatron/core/transformer/transformer_config.py
+++ b/megatron/core/transformer/transformer_config.py
@@ -146,12 +146,13 @@ class TransformerConfig(ModelParallelConfig):
 
     ### Begin MuP Code ###
     # MuP enabled
-    enable_mup: bool = True
+    enable_mup: bool = False
+    mup_coord_check: bool = False
     mup_hidden_lr_scale: float = 1.0
     mup_hidden_weights_scale: float = 1.0
 
     # Depth scaling enabled
-    enable_depth_scale: bool = True
+    enable_depth_scale: bool = False
     depth_multiplier: float = 1.0
     depth_alpha: float = 1.0
 
@@ -228,7 +229,10 @@ class TransformerConfig(ModelParallelConfig):
         ### Begin MuP Code ###
         # Check if mup-enable flag is included in args
         if self.enable_mup is None:
-            self.enable_mup = True
+            self.enable_mup = False
+
+        if self.mup_coord_check is None:
+            self.mup_coord_check = False
 
         if self.enable_depth_scale is None:
             self.enable_depth_scale = True

diff --git a/megatron/model/gpt_model.py b/megatron/model/gpt_model.py
index e5e60c43ee4..03b1492855f 100644
--- a/megatron/model/gpt_model.py
+++ b/megatron/model/gpt_model.py
@@ -38,7 +38,7 @@ def post_language_model_processing(lm_output, labels, logit_weights,
                                    parallel_output,
                                    fp16_lm_cross_entropy):
-    
+
     # Output. Format [s b h]
     output = parallel_lm_logits(
         lm_output,

diff --git a/megatron/model/language_model.py b/megatron/model/language_model.py
index dcb77409ba2..c3f158300e2 100644
--- a/megatron/model/language_model.py
+++ b/megatron/model/language_model.py
@@ -33,14 +33,22 @@ def parallel_lm_logits(input_, word_embeddings_weight, parallel_output,
         input_parallel = tensor_parallel.copy_to_tensor_model_parallel_region(input_)
         async_grad_allreduce = False
 
+    ### Begin MuP Code ###
+    if args.enable_mup:
+        log_mult = (args.mup_hidden_weights_scale ** (-1))
+    else:
+        log_mult = 1.0
+    ### End MuP Code ###
+
     # Matrix multiply.
-    logits_parallel = tensor_parallel.linear_with_grad_accumulation_and_async_allreduce(
+    logits_parallel = log_mult * tensor_parallel.linear_with_grad_accumulation_and_async_allreduce(
         input=input_parallel,
         weight=word_embeddings_weight,
         bias=bias,
         gradient_accumulation_fusion=args.gradient_accumulation_fusion,
         async_grad_allreduce=async_grad_allreduce,
         sequence_parallel=args.sequence_parallel)
+
     # Gather if needed.
 
     if parallel_output:
@@ -147,8 +155,7 @@ def __init__(self,
         super(Embedding, self).__init__()
 
         self.hidden_size = hidden_size
-
-
+        self.init_method = config.init_method ### Begin MuP Comment --- Keeping this because it might be used for initializing position embeddings ?
         self.num_tokentypes = num_tokentypes
 
         args = get_args()
@@ -155,9 +164,16 @@ def __init__(self,
         # Word embeddings (parallel).
         self.embedding_weights_in_fp32 = embedding_weights_in_fp32
         self.params_dtype = args.params_dtype
 
-        self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
-            vocab_size, self.hidden_size, config=config, init_method=config.init_method)
+        ### Begin MuP Code ### -- Do this only for dense inputs
+        if config.enable_mup:
+            load_init_function = init_method_normal( config.init_method_std * (vocab_size ** (-1/2)))
+        else:
+            load_init_function = config.init_method
 
+        self.word_embeddings = tensor_parallel.VocabParallelEmbedding(
+            vocab_size, self.hidden_size, config=config, init_method=load_init_function)
+        ### End MuP Code ###
+
         self._word_embeddings_key = 'word_embeddings'
 
         # Position embedding (serial).
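For orientation before the transformer changes below, here is a minimal standalone sketch of how the width-dependent multipliers wired through these hunks can be derived. The helper name make_mup_multipliers and its return keys are illustrative only, not part of these patches:

    # Sketch only: derive muP-style multipliers from a width ratio, mirroring
    # ALCF/helpers.sh (MUP_MUL = HIDDEN / MUP_BASE_WIDTH) and the hunks above.
    def make_mup_multipliers(width: int, base_width: int) -> dict:
        m = width / base_width                # e.g. 2048 / 256 = 8.0
        return {
            "hidden_weights_scale": m,        # passed as --mup-hidden-weights-scale
            "hidden_lr_scale": m,             # passed as --mup-hidden-lr-scale
            "output_logit_mult": 1.0 / m,     # matches log_mult = scale ** (-1) above
        }

    print(make_mup_multipliers(2048, 256))
    # {'hidden_weights_scale': 8.0, 'hidden_lr_scale': 8.0, 'output_logit_mult': 0.125}

The embedding path is handled separately above: under MuP its init std becomes init_method_std * vocab_size ** (-1/2) rather than a function of the width ratio.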
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index 592ff2855b3..f8c79843ebf 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -18,7 +18,7 @@
 from megatron.model.fused_softmax import FusedScaleMaskSoftmax
 from megatron.model.fused_bias_gelu import bias_gelu_impl
 from megatron.model.rotary_pos_embedding import apply_rotary_pos_emb
-from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu
+from megatron.model.utils import attention_mask_func, openai_gelu, erf_gelu, init_method_normal
 import deepspeed
 from deepspeed.moe.layer import MoE
 from deepspeed.accelerator import get_accelerator
@@ -113,13 +113,20 @@ def __init__(self, config, moe=False, enable_expert_tensor_parallelism=False):
         ffn_hidden_size = config.ffn_hidden_size
         if config.gated_linear_unit:
             ffn_hidden_size *= 2
+
+        ### Begin MuP Code ###
+        if config.enable_mup:
+            load_init_method = init_method_normal(( config.mup_hidden_weights_scale ** (-1) ) * config.init_method_std )
+        else:
+            load_init_method = config.init_method
+        ### End MuP Code ###
 
         # Project to 4h. If using swiglu double the output width, see https://arxiv.org/pdf/2002.05202.pdf
         self.dense_h_to_4h = tensor_parallel.ColumnParallelLinear(
             config.hidden_size,
             ffn_hidden_size,
             config=config,
-            init_method=config.init_method,
+            init_method=load_init_method, ### Changed here #config.init_method,
             bias=self.add_bias,
             gather_output=False,
             skip_bias_add=True,
@@ -157,7 +164,7 @@ def squared_relu(x):
             config.ffn_hidden_size,
             config.hidden_size,
             config=config,
-            init_method=config.output_layer_init_method,
+            init_method=load_init_method, ### Changed here #config.output_layer_init_method,
             bias=self.add_bias,
             input_is_parallel=True,
             moe=moe,
@@ -663,33 +670,41 @@ def __init__(
             assert self.hidden_size_per_attention_head == core.utils.divide(
                 kv_projection_size, config.num_key_value_heads
             )
+
+        ### Begin MuP Code ###
+        if config.enable_mup:
+            print("------------------SCALING VARIANCE OF ATTN----------------")
+            load_init_method = init_method_normal( (config.mup_hidden_weights_scale ** (-1)) * config.init_method_std)
+        else:
+            load_init_method = config.init_method
+        ### End MuP Code ###
 
         # Strided linear layer.
         if attention_type == AttnType.self_attn:
-            self.query_key_value = tensor_parallel.ColumnParallelLinear(
+            self.query_key_value = tensor_parallel.ColumnParallelLinear( ### Changed here
                 config.hidden_size,
                 projection_size + 2 * kv_projection_size,
                 config=config,
-                init_method=config.init_method,
+                init_method=load_init_method, ### Changed here
                 bias=args.add_bias_linear,
                 gather_output=False,
             )
         else:
             assert attention_type == AttnType.cross_attn
-            self.query = tensor_parallel.ColumnParallelLinear(
-                config.hidden_size,
+            self.query = tensor_parallel.ColumnParallelLinear( ### Changed here
+                config.hidden_size,
                 projection_size,
                 config=config,
-                init_method=config.init_method,
+                init_method=load_init_method, ### Changed here
                 bias=config.add_bias_linear,
                 gather_output=False,
             )
 
-            self.key_value = tensor_parallel.ColumnParallelLinear(
+            self.key_value = tensor_parallel.ColumnParallelLinear( ### Changed here
                 config.hidden_size,
                 2 * projection_size,
                 config=config,
-                init_method=config.init_method,
+                init_method=load_init_method, ### Changed here
                 bias=config.add_bias_linear,
                 gather_output=False,
             )
@@ -739,7 +754,7 @@ def __init__(
             projection_size,
             config.hidden_size,
             config=config,
-            init_method=config.output_layer_init_method,
+            init_method=load_init_method, ### Changed here #config.output_layer_init_method,
             bias=args.add_bias_linear,
             input_is_parallel=True,
             skip_bias_add=True,
@@ -1480,6 +1495,13 @@ def forward(
             inference_params=inference_params,
             rotary_pos_emb=rotary_pos_emb,
         )
+
+        ### Begin MuP Code ###
+        args = get_args()
+
+        #if args.enable_mup:
+        #    attention_output = ( args.mup_hidden_weights_scale ** (-1) ) * attention_output
+        ### End MuP Code ###
 
         # Residual connection.
         if self.apply_residual_connection_post_layernorm:

diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 9434f1064dd..1869adffffc 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -75,9 +75,9 @@ def get_param_groups(
     galore_params = []
     target_modules_list = ["attn", "mlp"]
     for module in modules:
-        print("-----------------AG DEBUG MODULE--------------------")
-        print(module)
-        print("-----------------AG DEBUG MODULE--------------------")
+        #print("-----------------AG DEBUG MODULE--------------------")
+        #print(module)
+        #print("-----------------AG DEBUG MODULE--------------------")
         for name, param in module.named_parameters():
             print("-----------------AG DEBUG PAR NAME--------------------")
             print(name)
             print("-----------------AG DEBUG PAR NAME--------------------")
@@ -99,9 +99,10 @@ def get_param_groups(
             if not no_wd and not scale_lr:
                 ### Begin MuP Code ###
-                if args.enable_mup and args.enable_depth_scale and ( 'self_attention' in name or 'mlp.dense' in name):
-                    print("Adding mup and depth scaling lr to -" )
-                    print(name)
+                if (args.enable_mup or args.enable_depth_scale) and ( 'self_attention' in name or 'mlp.dense' in name):
+                    #print("Adding mup and depth scaling lr to -" )
+                    #print(name)
+                    print("--------------- MUP flag enabled --------------------------")
                     wd_mup_wd_no_scale_lr_depth_mup_lr.append(param)
 
                 else:
                     wd_no_scale_lr.append(param)
                 ### End MuP Code ###
@@ -112,13 +113,13 @@ def get_param_groups(
             elif no_wd and not scale_lr:
                 ### Begin MuP Code ###
                 if args.enable_depth_scale and ('input_layernorm' in name or 'post_attention_layernorm' in name):
-                    print("Adding depth scaling lr to -" )
-                    print(name)
+                    #print("Adding depth scaling lr to -" )
+                    #print(name)
                     # Add depth scaling
                     no_wd_no_scale_lr_depth_lr.append(param)
                 elif args.enable_depth_scale and ('self_attention' in name or 'mlp.dense' in name):
-                    print("Adding depth scaling lr to -" )
-                    print(name)
+                    #print("Adding depth scaling lr to -" )
+                    #print(name)
                     # Add depth scaling
                     no_wd_no_scale_lr_depth_lr.append(param)
                 else:
                     no_wd_no_scale_lr.append(param)
@@ -138,9 +139,9 @@ def get_param_groups(
     if len(no_wd_scale_lr):
         param_groups.append({'name': 'no_wd_scale_lr', 'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
     if len(no_wd_no_scale_lr_depth_lr):
-        param_groups.append({'name': 'no_wd_no_scale_lr_depth_lr', 'params': no_wd_no_scale_lr_depth_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':1.0, '_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1), '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
+        param_groups.append({'name': 'no_wd_no_scale_lr_depth_lr', 'params': no_wd_no_scale_lr_depth_lr, 'wd_mult': 0.0, 'lr_mult': 1.0, '_mup_lr_mult':1.0, '_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1), '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
     if len(wd_mup_wd_no_scale_lr_depth_mup_lr):
-        param_groups.append({'name': 'wd_mup_wd_no_scale_lr_depth_mup_lr', 'params': wd_mup_wd_no_scale_lr_depth_mup_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':args.mup_hidden_lr_scale ** (-1), '_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1), '_mup_wd_mult':args.mup_hidden_weights_scale, '_depth_wd_mult':1.0})
+        param_groups.append({'name': 'wd_mup_wd_no_scale_lr_depth_mup_lr', 'params': wd_mup_wd_no_scale_lr_depth_mup_lr, 'wd_mult': 1.0, 'lr_mult': 1.0, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':args.mup_hidden_weights_scale, '_depth_wd_mult':1.0}) ### '_mup_lr_mult':args.mup_hidden_lr_scale ** (-1),'_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1),
 
     return param_groups
@@ -170,6 +171,15 @@ def get_megatron_optimizer(
     )
 
     optimizer = None
+
+    ### Begin MuP Code ### -- args is a mutable object
+    if args.enable_mup and args.enable_depth_scale:
+        args.adam_eps = args.adam_eps * (args.mup_hidden_lr_scale ** (-1)) * (args.depth_multiplier ** (-args.depth_alpha))
+    elif args.enable_mup:
+        args.adam_eps = args.adam_eps * (args.mup_hidden_lr_scale ** (-1))
+    ### End MuP Code ###
+
+
     # ---- CPU Optimizer --------------------------------------
     if args.cpu_optimizer:
         assert args.optimizer == 'adam', 'CPU offloading is for Adam'

diff --git a/megatron/optimizer_param_scheduler.py b/megatron/optimizer_param_scheduler.py
index e75a6cee1fb..f2c2572e93f 100644
--- a/megatron/optimizer_param_scheduler.py
+++ b/megatron/optimizer_param_scheduler.py
@@ -238,9 +238,9 @@ def step(self, increment, token_num=None):
         new_wd = self.get_wd()
         for group in self.optimizer.param_groups:
             ### Begin MuP Code ###
-            print("-----------AG DEBUG-------------")
-            print(group)
-            print("-----------AG DEBUG-------------")
+            #print("-----------AG DEBUG-------------")
+            #print(group)
+            #print("-----------AG DEBUG-------------")
 
             group['lr'] = new_lr * group.get('lr_mult', 1.0) * group.get('_mup_lr_mult', 1.0) * group.get('_depth_lr_mult', 1.0)
             group['weight_decay'] = new_wd * group.get('wd_mult', 1.0) * group.get('_mup_wd_mult', 1.0) * group.get('_depth_wd_mult', 1.0)

diff --git a/megatron/training.py b/megatron/training.py
index 43710fd691d..1b87eb3c5f6 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -64,6 +64,7 @@
     found_kill_switch,
     unwrap_model,
     update_rotary_pos_emb,
+    mup_coord_check, ### Begin MuP Change ###
 )
 
 from megatron.profiler import (
@@ -160,6 +161,7 @@ def pretrain(
     Returns:
         model (torch.nn.Module)
     """
+
     # Initalize and get arguments, timers, and Tensorboard writer.
     initialize_megatron(
         extra_args_provider=extra_args_provider,
@@ -963,7 +965,7 @@ def train_step(
         skipped_iter = 0
     else:
         skipped_iter = 1
-    
+
     # Empty unused memory.
     if args.empty_unused_memory_level >= 2 and accelerator is not None:
         accelerator.empty_cache()
@@ -1017,7 +1019,6 @@ def train(
     assert accelerator is not None
     setup_profiler(args, accelerator.device_name())
-
     if args.random_ltd:
         # random-ltd requires different randomness on each rank
         import random
@@ -1279,6 +1280,8 @@ def train(
         )
         iteration += 1
         args.iteration = iteration
+
+
         new_samples = (
             mpu.get_data_parallel_world_size()
             * args.micro_batch_size
@@ -1326,6 +1329,13 @@ def train(
         params_norm = None
         if args.log_params_norm:
             params_norm = calc_params_l2_norm(model)
+
+        ### Begin MuP Code ###
+        if args.enable_mup and args.mup_coord_check:
+            if iteration <= 20:
+                mup_coord_check(model)
+        ### End MuP Code ###
+
         report_memory_flag = training_log(
             loss_dict,
             total_loss_dict,

diff --git a/megatron/training_log.py b/megatron/training_log.py
index 3eb96c392d9..89285cecfec 100644
--- a/megatron/training_log.py
+++ b/megatron/training_log.py
@@ -26,6 +26,7 @@
     num_floating_point_operations,
 )
 
+import csv
 
 RANK: int = ez.get_rank()
 WORLD_SIZE: int = ez.get_world_size()
@@ -101,7 +102,7 @@ def training_log(
     total_loss_dict[nan_iters_key] = total_loss_dict.get(nan_iters_key, 0) + int(
         got_nan
     )
-    
+
     # Logging.
     timers_to_log = [
         "forward-backward",
@@ -262,6 +263,11 @@ def training_log(
                 params_norm,
                 args.consumed_train_tokens,
             )
+            ### AG Debug ###
+            print("------- param norm --------")
+            print(params_norm)
+            print("------- param norm --------")
+            ### AG Debug ###
         if hasattr(args, "actual_seq_length"):
             writer.add_scalar(
                 "seqlen/actual_seq_length", args.actual_seq_length, iteration
             )

diff --git a/megatron/utils.py b/megatron/utils.py
index 5f1c66640d1..9529728a8db 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -125,6 +125,45 @@ def finalize(self):
 PerfTrace = dftracer()
 DFTRACER_ENABLE = False
 
+### Begin MuP Code ###
+def mup_coord_check(model):
+
+    args = get_args()
+
+    temp_list = []
+    for name, params in model[0].named_parameters():
+
+        if 'word_embeddings' in name:
+            print("-------------------------------------------")
+            print(name)
+            print( (params.data.float()).abs().mean().item() )
+            temp_list.append((params.data.float()).abs().mean().item())
+        #elif '8.mlp.dense_4h_to' in name:
+        elif 'layers.0.' in name:
+            print("-------------------------------------------")
+            print(name)
+            print( (params.data.float()).abs().mean().item() )
+            temp_list.append((params.data.float()).abs().mean().item())
+        elif 'layers.9.' in name:
+            print("-------------------------------------------")
+            print(name)
+            print( (params.data.float()).abs().mean().item() )
+            temp_list.append((params.data.float()).abs().mean().item())
+        elif 'output_layer' in name:
+            print("-------------------------------------------")
+            print(name)
+            print( (params.data.float()).abs().mean().item() )
+            temp_list.append((params.data.float()).abs().mean().item())
+
+    print(temp_list)
+
+    file_name = f"adamw_hidden{args.hidden_size}_ffn{args.ffn_hidden_size}_depth{args.num_layers}_s1234_v3.txt"
+    with open(f"mup_coord_check/{file_name}", "a") as file:
+        for item in temp_list:
+            file.write( "%s " % item )
+        file.write('\n')
+### End MuP Code ###
+
 
 def get_logger(
     name: str,

diff --git a/pretrain_gpt_alcf.py b/pretrain_gpt_alcf.py
index c196b19a420..0f171f6f2ed 100644
--- a/pretrain_gpt_alcf.py
+++ b/pretrain_gpt_alcf.py
@@ -450,6 +450,7 @@ def forward_step(data_iterator, model) -> tuple[torch.Tensor | None, Callable]:
 
     # Output_tensor stores the standard loss,
     # loss_func calculates the total loss.
+
     return output_tensor, partial(loss_func, loss_mask, moe_loss, mos_loss)

From f665983637c1f2cbf6357083cf21fd04b9c29884 Mon Sep 17 00:00:00 2001
From: AGupta41
Date: Thu, 17 Jul 2025 14:17:07 +0000
Subject: [PATCH 4/4] Depth Scaling

---
 .env-                          | 414 +++++++++++++++++++++++++++++++++
 ALCF/helpers.sh                |   2 +-
 megatron/model/transformer.py  |  61 +++--
 megatron/optimizer/__init__.py |  25 +-
 megatron/training.py           |  31 +--
 megatron/utils.py              |   5 +-
 6 files changed, 484 insertions(+), 54 deletions(-)
 create mode 100644 .env-

diff --git a/.env- b/.env-
new file mode 100644
index 00000000000..23b90635c25
--- /dev/null
+++ b/.env-
@@ -0,0 +1,414 @@
[... 414 lines of an accidentally committed shell-environment snapshot (.env-) omitted ...]
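Taken together, patches 2 and 3 route the MuP and depth factors through extra per-group keys ('_mup_lr_mult', '_depth_lr_mult', '_mup_wd_mult', '_depth_wd_mult') that OptimizerParamScheduler.step folds into each group's lr and weight_decay on every scheduler step. A minimal self-contained sketch of that mechanism, using plain torch.optim and made-up multiplier values rather than the Megatron classes themselves:

    import torch

    w_hidden = torch.nn.Parameter(torch.zeros(4, 4))  # stands in for an attention/MLP weight
    w_norm = torch.nn.Parameter(torch.zeros(4))       # stands in for a layernorm weight

    optimizer = torch.optim.AdamW([
        # Extra keys ride along in the group dict, as in get_param_groups()
        {"params": [w_hidden], "wd_mult": 1.0, "lr_mult": 1.0,
         "_mup_lr_mult": 8.0 ** (-1),           # e.g. mup_hidden_lr_scale ** (-1)
         "_depth_lr_mult": 5.0 ** (0.5 - 1.0),  # e.g. depth_multiplier ** (depth_alpha - 1)
         "_mup_wd_mult": 8.0, "_depth_wd_mult": 1.0},
        {"params": [w_norm], "wd_mult": 0.0, "lr_mult": 1.0,
         "_mup_lr_mult": 1.0, "_depth_lr_mult": 5.0 ** (0.5 - 1.0),
         "_mup_wd_mult": 1.0, "_depth_wd_mult": 1.0},
    ], lr=3e-4)

    # The same per-group update the patched OptimizerParamScheduler.step performs:
    new_lr, new_wd = 3e-4, 0.1
    for group in optimizer.param_groups:
        group["lr"] = (new_lr * group.get("lr_mult", 1.0)
                       * group.get("_mup_lr_mult", 1.0) * group.get("_depth_lr_mult", 1.0))
        group["weight_decay"] = (new_wd * group.get("wd_mult", 1.0)
                                 * group.get("_mup_wd_mult", 1.0) * group.get("_depth_wd_mult", 1.0))

    print([g["lr"] for g in optimizer.param_groups])

Because torch optimizers preserve unknown keys in param-group dicts, the multipliers survive untouched between scheduler steps, which is the design choice these patches rely on.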
diff --git a/ALCF/helpers.sh b/ALCF/helpers.sh
index 460820c1c84..feae4e7ccea 100644
--- a/ALCF/helpers.sh
+++ b/ALCF/helpers.sh
@@ -227,7 +227,7 @@ setup_run_cmd() {
     export MUP_BASE_WIDTH=${MUP_BASE_WIDTH:-256}
     export MUP_MUL=$(( $HIDDEN / $MUP_BASE_WIDTH ))
     mup_flags+=(
-        #"--enable-mup"
+        "--enable-mup"
         "--mup-coord-check=True"
         "--mup-hidden-weights-scale=${MUP_MUL}"
         "--mup-hidden-lr-scale=${MUP_MUL}"
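Note: with the flag restored, helpers.sh derives the width multiplier as MUP_MUL = HIDDEN / MUP_BASE_WIDTH and forwards it to both --mup-hidden-weights-scale and --mup-hidden-lr-scale. A minimal sketch of that arithmetic, with HIDDEN=2048 as an assumed example value (the patch itself does not fix HIDDEN):

    # Width-multiplier arithmetic mirrored from ALCF/helpers.sh.
    # HIDDEN is an assumed example; MUP_BASE_WIDTH defaults to 256 there.
    HIDDEN = 2048
    MUP_BASE_WIDTH = 256

    # bash does integer division in $(( HIDDEN / MUP_BASE_WIDTH )), so widths
    # are expected to be exact multiples of the base width.
    MUP_MUL = HIDDEN // MUP_BASE_WIDTH
    print(MUP_MUL)  # 8, passed as --mup-hidden-weights-scale / --mup-hidden-lr-scale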
diff --git a/megatron/model/transformer.py b/megatron/model/transformer.py
index f8c79843ebf..6eaec13a4de 100644
--- a/megatron/model/transformer.py
+++ b/megatron/model/transformer.py
@@ -116,7 +116,7 @@ def __init__(self, config, moe=False, enable_expert_tensor_parallelism=False):
 
         ### Begin MuP Code ###
         if config.enable_mup:
-            load_init_method = init_method_normal(( config.mup_hidden_weights_scale ** (-1) ) * config.init_method_std )
+            load_init_method = init_method_normal(( config.mup_hidden_weights_scale ** (-1/2) ) * config.init_method_std )
         else:
             load_init_method = config.init_method
         ### End MuP Code ###
@@ -302,7 +302,7 @@ def forward(self, query_layer, key_layer, value_layer, attention_mask):
         # ===================================
         # Raw attention scores. [b, np, s, s]
         # ===================================
-        
+
         # [b, np, sq, sk]
         output_size = (
             query_layer.size(1),
@@ -426,9 +426,10 @@ def __init__(
         self.causal = causal
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout
+        args = get_args()
 
         # Use FlashAttention-2 when args.use_flash_attn_v2 is True
-        args = get_args()
+        #args = get_args()
         self.use_flash_attn_builder_v1 = False
         self.use_flash_attn_builder_v2 = False
         self.use_flash_attn = False
@@ -503,6 +504,15 @@ def forward(self, q, k, v):
             )
             dropout_p = 0
 
+        ### Begin MuP Code ###
+        args = get_args()
+
+        if args.enable_mup:
+            #print("---------------------------------- MUP FLASH ATTN -------------------------")
+            #print( type(self.softmax_scale) )
+            self.softmax_scale = (args.hidden_size ** (-1)) * (args.depth_base ** (1/2))
+        ### End MuP Code ###
+
         if self.use_flash_attn:
             output = self.flash_attn_func(
                 q,
@@ -562,7 +572,8 @@ def __init__(
         self.causal = causal
         self.softmax_scale = softmax_scale
         self.dropout_p = attention_dropout
-        
+
+
     def forward(self, q, k, v):
         """Implements the multihead softmax attention.
         Arguments
@@ -674,7 +685,7 @@ def __init__(
         ### Begin MuP Code ###
         if config.enable_mup:
             print("------------------SCALING VARIANCE OF ATTN----------------")
-            load_init_method = init_method_normal( (config.mup_hidden_weights_scale ** (-1)) * config.init_method_std)
+            load_init_method = init_method_normal( (config.mup_hidden_weights_scale ** (-1/2)) * config.init_method_std)
         else:
             load_init_method = config.init_method
         ### End MuP Code ###
@@ -716,7 +727,7 @@ def __init__(
             )
         elif self.use_flash_attn:
             local_attn = FlashSelfAttention(
-                causal=True, attention_dropout=config.attention_dropout
+                causal=True, attention_dropout=config.attention_dropout, softmax_scale=(args.hidden_size ** (-1)) * (args.depth_base ** (1/2))
             )
         else:
             local_attn = CoreAttention(self.layer_number, config, self.attn_mask_type)
@@ -967,7 +978,7 @@ def forward(
                 q_pos_emb = q_pos_emb[:sequence_end, :, :, :]
                 k_pos_emb = k_pos_emb[:sequence_end, :, :, :]
                 rotary_pos_emb = (q_pos_emb, k_pos_emb)
-            
+
         # ==================================
         # core attention computation
         # ==================================
@@ -981,6 +992,17 @@ def forward(
             # absolute positional embedding.
             # otherwise, only relative positional embedding takes effect
             # value_layer = apply_rotary_pos_emb(value_layer, k_pos_emb)
+
+            ### Begin MuP Code ###
+            #args = get_args()
+
+            #if args.enable_mup:
+                #print("------------------- QK rescaling ---------------------------")
+                #query_layer = (args.mup_hidden_weights_scale ** (-1/2)) * query_layer
+                #key_layer = (args.mup_hidden_weights_scale ** (-1/2)) * key_layer
+
+            ### End MuP Code ###
+
         if self.enable_ds_sequence_parallel:
             batch_dim_idx = 1
@@ -1041,7 +1063,7 @@ def forward(
         # =================
 
         output, bias = self.dense(context_layer)
-        
+
         return output, bias
 
@@ -1501,6 +1523,11 @@ def forward(
         #if args.enable_mup:
         #    attention_output = ( args.mup_hidden_weights_scale ** (-1) ) * attention_output
+
+        residual_mult = 1.0
+        if args.enable_depth_scale:
+            residual_mult = ( args.depth_multiplier ) ** (-args.depth_alpha)
+
         ### End MuP Code ###
 
         # Residual connection.
@@ -1523,17 +1550,17 @@ def forward(
             bias_dropout_add_func = get_bias_dropout_add(self.training)
 
         if attention_bias is not None:
-            attention_bias = attention_bias.expand_as(residual)
+            attention_bias = residual_mult * attention_bias.expand_as(residual) ### Changed here
         with self.bias_dropout_add_exec_handler():
             layernorm_input = bias_dropout_add_func(
-                attention_output, attention_bias, residual, self.hidden_dropout
-            )
+                residual_mult * attention_output, attention_bias, residual, self.hidden_dropout
+            ) ### MuP Change Here
         else:
             out = torch.nn.functional.dropout(
-                attention_output + attention_bias,
+                residual_mult * (attention_output + attention_bias),
                 p=self.hidden_dropout,
                 training=self.training,
-            )
+            ) ### MuP Change Here
             layernorm_input = residual + self.drop_path(out)
 
         # Layer norm post the self attention.
@@ -1600,11 +1627,11 @@ def forward(
         if self.drop_path is None:
             if mlp_bias is not None:
-                mlp_bias = mlp_bias.expand_as(residual)
+                mlp_bias = residual_mult * mlp_bias.expand_as(residual) ### Change here
             with self.bias_dropout_add_exec_handler():
                 output = bias_dropout_add_func(
-                    mlp_output, mlp_bias, residual, self.hidden_dropout
-                )
+                    residual_mult * mlp_output, mlp_bias, residual, self.hidden_dropout
+                ) ### MuP Change Here
 
             # Jit compiled function creates 'view' tensor. This tensor
             # potentially gets saved in the MPU checkpoint function context,
@@ -1618,7 +1645,7 @@ def forward(
 
         else:
             if mlp_bias is not None:
-                mlp_output = mlp_output + mlp_bias
+                mlp_output = residual_mult * (mlp_output + mlp_bias) ### MuP Change Here
             out = torch.nn.functional.dropout(
                 mlp_output, p=self.hidden_dropout, training=self.training
             )
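Note: the transformer.py hunks above wire in two rules: attention logits get a MuP-style 1/d softmax scale (times sqrt(depth_base)) in place of the usual 1/sqrt(d), and every residual branch is damped by depth_multiplier ** (-depth_alpha). A standalone sketch of both formulas exactly as the patch writes them; the numeric values are illustrative examples, not taken from any run:

    hidden_size = 2048                            # args.hidden_size
    depth_base = 2                                # args.depth_base
    num_layers = 8                                # args.num_layers
    depth_alpha = 0.5                             # args.depth_alpha
    depth_multiplier = num_layers / depth_base    # args.depth_multiplier

    # MuP attention scale, as set on FlashSelfAttention above.
    standard_softmax_scale = hidden_size ** (-1 / 2)
    mup_softmax_scale = (hidden_size ** (-1)) * (depth_base ** (1 / 2))

    # Depth-scaled residual update: x = residual + residual_mult * branch(x).
    residual_mult = depth_multiplier ** (-depth_alpha)

    print(f"softmax scale: {standard_softmax_scale:.2e} -> {mup_softmax_scale:.2e}")  # 2.21e-02 -> 6.90e-04
    print(f"residual_mult at {num_layers} layers: {residual_mult:.3f}")               # 0.500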
diff --git a/megatron/optimizer/__init__.py b/megatron/optimizer/__init__.py
index 1869adffffc..a93eb53b53e 100644
--- a/megatron/optimizer/__init__.py
+++ b/megatron/optimizer/__init__.py
@@ -66,6 +66,17 @@ def get_param_groups(
     no_wd_no_scale_lr_depth_lr = []
     wd_mup_wd_no_scale_lr_depth_mup_lr = []
+
+    mup_lr = 1.0
+    mup_wd = 1.0
+    depth_lr = 1.0
+
+    if args.enable_mup:
+        mup_lr = (args.mup_hidden_weights_scale) ** (-1)
+        mup_wd = args.mup_hidden_weights_scale
+
+    if args.enable_depth_scale:
+        depth_lr = (args.depth_multiplier) ** (args.depth_alpha - 1)
     ### End MuP Code ###
 
     wd_no_scale_lr = []
@@ -74,6 +85,7 @@ def get_param_groups(
     no_wd_scale_lr = []
     galore_params = []
     target_modules_list = ["attn", "mlp"]
+
     for module in modules:
         #print("-----------------AG DEBUG MODULE--------------------")
         #print(module)
@@ -139,9 +151,9 @@ def get_param_groups(
     if len(no_wd_scale_lr):
         param_groups.append({'name': 'no_wd_scale_lr', 'params': no_wd_scale_lr, 'wd_mult': 0.0, 'lr_mult': lr_mult, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
     if len(no_wd_no_scale_lr_depth_lr):
-        param_groups.append({'name': 'no_wd_no_scale_lr_depth_lr', 'params': no_wd_no_scale_lr_depth_lr, 'wd_mult': 0.0, 'lr_mult': 1.0, '_mup_lr_mult':1.0, '_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1), '_mup_wd_mult':1.0, '_depth_wd_mult':1.0})
+        param_groups.append({'name': 'no_wd_no_scale_lr_depth_lr', 'params': no_wd_no_scale_lr_depth_lr, 'wd_mult': 0.0, 'lr_mult': 1.0, '_mup_lr_mult':mup_lr, '_depth_lr_mult':depth_lr, '_mup_wd_mult':mup_wd, '_depth_wd_mult':1.0})
     if len(wd_mup_wd_no_scale_lr_depth_mup_lr):
-        param_groups.append({'name': 'wd_mup_wd_no_scale_lr_depth_mup_lr', 'params': wd_mup_wd_no_scale_lr_depth_mup_lr, 'wd_mult': 1.0, 'lr_mult': 1.0, '_mup_lr_mult':1.0, '_depth_lr_mult':1.0, '_mup_wd_mult':args.mup_hidden_weights_scale, '_depth_wd_mult':1.0}) ### '_mup_lr_mult':args.mup_hidden_lr_scale ** (-1),'_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1),
+        param_groups.append({'name': 'wd_mup_wd_no_scale_lr_depth_mup_lr', 'params': wd_mup_wd_no_scale_lr_depth_mup_lr, 'wd_mult': 1.0, 'lr_mult': 1.0, '_mup_lr_mult': mup_lr, '_depth_lr_mult': depth_lr, '_mup_wd_mult':mup_wd, '_depth_wd_mult':1.0}) ### '_mup_lr_mult':args.mup_hidden_lr_scale ** (-1),'_depth_lr_mult':args.depth_multiplier ** (args.depth_alpha - 1),
 
     return param_groups
@@ -173,10 +185,13 @@ def get_megatron_optimizer(
     optimizer = None
 
     ### Begin MuP Code ### -- args is a mutable object
-    if args.enable_mup:
-        args.adam_eps = args.adam_eps * (args.mup_hidden_lr_scale ** (-1))
-    elif args.enable_mup and args.enable_depth_scale:
+    if args.enable_mup and args.enable_depth_scale:
         args.adam_eps = args.adam_eps * (args.mup_hidden_lr_scale ** (-1)) * (args.depth_multiplier ** (-args.depth_alpha))
+    elif args.enable_mup:
+        args.adam_eps = args.adam_eps * (args.mup_hidden_lr_scale ** (-1))
+    elif args.enable_depth_scale:
+        args.adam_eps = args.adam_eps * (args.depth_multiplier ** (-args.depth_alpha))
+
     ### End MuP Code ###
 
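Note: the _mup_lr_mult / _depth_lr_mult / _mup_wd_mult keys are consumed by the scheduler (the optimizer_param_scheduler.py side of patch 2 is not reproduced here), which is assumed to compose them multiplicatively with the base values. A hedged sketch of that intent, with illustrative base values:

    # Sketch of how the per-group keys added above are assumed to combine into
    # the effective learning rate and weight decay. Treat this as the intent,
    # not the verbatim scheduler implementation.
    base_lr, base_wd = 3e-4, 0.1

    mup_hidden_weights_scale = 8.0           # width multiplier from helpers.sh
    depth_multiplier, depth_alpha = 4.0, 0.5

    group = {
        'name': 'wd_mup_wd_no_scale_lr_depth_mup_lr',
        'wd_mult': 1.0,
        'lr_mult': 1.0,
        '_mup_lr_mult': mup_hidden_weights_scale ** (-1),         # mup_lr
        '_depth_lr_mult': depth_multiplier ** (depth_alpha - 1),  # depth_lr
        '_mup_wd_mult': mup_hidden_weights_scale,                 # mup_wd
        '_depth_wd_mult': 1.0,
    }

    lr = base_lr * group['lr_mult'] * group['_mup_lr_mult'] * group['_depth_lr_mult']
    wd = base_wd * group['wd_mult'] * group['_mup_wd_mult'] * group['_depth_wd_mult']
    print(f"{group['name']}: lr={lr:.2e}, wd={wd:.2e}")  # lr=1.88e-05, wd=8.00e-01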
diff --git a/megatron/training.py b/megatron/training.py
index 1b87eb3c5f6..bfc42599dd9 100644
--- a/megatron/training.py
+++ b/megatron/training.py
@@ -1331,38 +1331,11 @@ def train(
             params_norm = calc_params_l2_norm(model)
 
         ### Begin MuP Code ###
-        if args.enable_mup and args.mup_coord_check:
+        if (args.enable_mup or args.enable_depth_scale) and args.mup_coord_check:
             if iteration <=20:
                 mup_coord_check(model)
-
-        '''
-        temp_list = []
-        for name, params in model[0].named_parameters():
-
-            if 'word_embeddings' in name:
-                print("-------------------------------------------")
-                print(name)
-                print( (params.data.float()).abs().mean().item() )
-                temp_list.append((params.data.float()).abs().mean().item())
-            elif '8.mlp.dense_4h_to' in name:
-                print("-------------------------------------------")
-                print(name)
-                print( (params.data.float()).abs().mean().item() )
-                temp_list.append((params.data.float()).abs().mean().item())
-            elif 'output_layer' in name:
-                print("-------------------------------------------")
-                print(name)
-                print( (params.data.float()).abs().mean().item() )
-                temp_list.append((params.data.float()).abs().mean().item())
-
-        print(temp_list)
-        with open("mup_coord_check/adamw_h2048_s123456.txt", "a") as file:
-
-            for item in temp_list:
-                file.write( "%s " % item )
-            file.write('\n')
-        '''
         ### End MuP Code ###
+
         report_memory_flag = training_log(
             loss_dict,
             total_loss_dict,
diff --git a/megatron/utils.py b/megatron/utils.py
index 9529728a8db..dbd2975867b 100644
--- a/megatron/utils.py
+++ b/megatron/utils.py
@@ -144,7 +144,7 @@ def mup_coord_check(model):
             print(name)
             print( (params.data.float()).abs().mean().item() )
             temp_list.append((params.data.float()).abs().mean().item())
-        elif 'layers.9.' in name:
+        elif 'layers.4.' in name:
             print("-------------------------------------------")
             print(name)
             print( (params.data.float()).abs().mean().item() )
@@ -157,7 +157,8 @@ def mup_coord_check(model):
 
     print(temp_list)
 
-    file_name = f"adamw_hidden{args.hidden_size}_ffn{args.ffn_hidden_size}_depth{args.num_layers}_s1234_v3.txt"
+    file_name = f"adamw_hidden{args.hidden_size}_ffn{args.ffn_hidden_size}_depth{args.num_layers}_s1234.txt"
+    #file_name = f"test_file.txt"
     with open(f"mup_coord_check/{file_name}", "a") as file:
         for item in temp_list:
             file.write( "%s " % item )
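Note: mup_coord_check() appends one space-separated row of mean-|weight| statistics per iteration, and the point of the check is that these traces stay roughly O(1) and line up when the same run is repeated at different widths. A minimal sketch for inspecting one such trace; the file name mirrors the f-string above, and numpy/matplotlib are assumptions of this sketch, not dependencies of the patch:

    # Load one coord-check trace and plot each tracked statistic over the
    # first ~20 training iterations.
    import numpy as np
    import matplotlib.pyplot as plt

    path = "mup_coord_check/adamw_hidden2048_ffn8192_depth8_s1234.txt"
    rows = np.loadtxt(path, ndmin=2)  # one row per iteration

    for col in range(rows.shape[1]):
        plt.plot(rows[:, col], label=f"tracked param {col}")
    plt.xlabel("training iteration")
    plt.ylabel("mean |weight|")
    plt.legend()
    plt.savefig("coord_check.png")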