From cbfb9d078e78b231baebdce185d262c08d88fe2e Mon Sep 17 00:00:00 2001 From: leejianwoo-collab Date: Sat, 13 Dec 2025 21:53:58 -0500 Subject: [PATCH] fix: correct margin calculation in DPO training --- tinker_cookbook/preference/train_dpo.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tinker_cookbook/preference/train_dpo.py b/tinker_cookbook/preference/train_dpo.py index 285c9618..d4ffafc7 100644 --- a/tinker_cookbook/preference/train_dpo.py +++ b/tinker_cookbook/preference/train_dpo.py @@ -140,7 +140,7 @@ def compute_dpo_loss( accuracy = (chosen_log_ratio > rejected_log_ratio).float().mean().item() chosen_rewards = dpo_beta * chosen_log_ratio rejected_rewards = dpo_beta * rejected_log_ratio - margin = dpo_beta * (chosen_rewards - rejected_rewards).mean().item() + margin = (chosen_rewards - rejected_rewards).mean().item() metrics = { "dpo_loss": loss.item(), @@ -394,3 +394,4 @@ def print_example(datum: tinker.Datum, tokenizer: Tokenizer, label: str = ""): weights = datum.loss_fn_inputs["weights"].data logger.info(f"\n{label} Example:") logger.info(format_colorized(int_tokens, cast(list[float], weights), tokenizer)) +