Commit c1ab42b

Allow KD loss during eval
Signed-off-by: Asha Anoosheh <[email protected]>
1 parent 99184ff commit c1ab42b

File tree

2 files changed: +8 −3 lines changed


megatron/post_training/loss_func.py

Lines changed: 4 additions & 2 deletions
@@ -55,16 +55,18 @@ def loss_func(loss_mask: torch.Tensor, output_tensor: torch.Tensor, model: GPTModel
     num_tokens = loss_mask.sum().clone().detach().to(torch.int)
     report = {'lm loss': torch.cat([loss_lm.clone().detach().view(1), num_tokens.view(1)])}

-    if model.training and args.export_kd_teacher_load:
+    if args.export_kd_teacher_load:
         # [ModelOpt]: Handle knowledge distillation
         losses = model.compute_kd_loss(
             student_loss=loss_lm,
             loss_reduction_fn=lambda x: _mask_loss(x, loss_mask),
         )
-        loss = losses["kd_loss"]

         report["total loss"] = torch.cat([losses["kd_loss"].clone().detach().view(1), num_tokens.view(1)])
         report["logits distillation loss"] = torch.cat([losses["logits_loss"].clone().detach().view(1), num_tokens.view(1)])
         report["intermediate distillation loss"] = torch.cat([losses["intermediate_loss"].clone().detach().view(1), num_tokens.view(1)])

+        if model.training:
+            loss = losses["kd_loss"]
+
     return loss, num_tokens, report
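
To make the behavioral change concrete, here is a minimal standalone sketch of the patched control flow. The helper name pick_loss_and_report and its signature are hypothetical, not Megatron code: once a teacher checkpoint is loaded, the KD metrics are computed and reported in both training and evaluation, but the returned loss is only replaced by the KD loss while training.

import torch

def pick_loss_and_report(loss_lm, kd_losses, num_tokens, training):
    """Hypothetical restatement of the patched loss_func logic: always
    report KD metrics when a teacher is present, but only let the KD loss
    replace the LM loss during training."""
    report = {"lm loss": torch.cat([loss_lm.detach().view(1), num_tokens.view(1)])}
    loss = loss_lm
    if kd_losses is not None:  # stands in for `args.export_kd_teacher_load`
        for name, key in (("total loss", "kd_loss"),
                          ("logits distillation loss", "logits_loss"),
                          ("intermediate distillation loss", "intermediate_loss")):
            report[name] = torch.cat([kd_losses[key].detach().view(1), num_tokens.view(1)])
        if training:  # new in this commit: eval keeps the plain LM loss
            loss = kd_losses["kd_loss"]
    return loss, num_tokens, report

# Eval-mode example: the KD metrics land in the report, but the returned
# loss is still the LM loss.
lm = torch.tensor(2.3)
kd = {"kd_loss": torch.tensor(1.7), "logits_loss": torch.tensor(1.1),
      "intermediate_loss": torch.tensor(0.6)}
n = torch.tensor(128, dtype=torch.int)
loss, _, report = pick_loss_and_report(lm, kd, n, training=False)
assert loss is lm and "total loss" in report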

megatron/training/training.py

Lines changed: 4 additions & 1 deletion
@@ -1300,7 +1300,10 @@ def train_step(forward_step_func, data_iterator, model, optimizer, opt_param_scheduler
     if has_nvidia_modelopt:
         # [ModelOpt]: Pipeline-parallel Distillation stacks student and teacher tensors
         adjust_tensor_shapes_fn = get_tensor_shapes_adjust_fn_for_distillation(
-            model, args.seq_length, args.micro_batch_size, args.decoder_seq_length
+            model,
+            seq_length=args.seq_length,
+            micro_batch_size=args.micro_batch_size,
+            decoder_seq_length=args.decoder_seq_length,
         )
     else:
         adjust_tensor_shapes_fn = None
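
This second hunk is a pure readability refactor: seq_length, micro_batch_size, and decoder_seq_length are all plain ints, so a positional call could silently transpose them. A small hypothetical sketch of the pattern (not Megatron's get_tensor_shapes_adjust_fn_for_distillation API) shows how keyword-only parameters harden such a factory against argument swaps:

# Hypothetical factory illustrating the keyword-argument pattern.
def make_adjust_fn(model, *, seq_length, micro_batch_size, decoder_seq_length):
    # The bare `*` makes every size keyword-only: passing the ints
    # positionally raises a TypeError instead of silently running with
    # swapped values, and each argument is named at the call site.
    def adjust(shapes):
        return shapes  # placeholder body for the sketch
    return adjust

adjust_fn = make_adjust_fn(
    None,  # stand-in for the model
    seq_length=4096,
    micro_batch_size=2,
    decoder_seq_length=4096,
)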
