Chunk Across Batch and Context length for logprob calculations for grpo #3628

base: nightly
The patch edits the generated trainer source in place via string replacement. The first hunk left-packs prompt padding and records the per-prompt pad counts before the old/ref logprob pass:

```diff
@@ -231,18 +231,20 @@ def grpo_trainer__generate_and_score_completions(function_name, function):
     # The new multi-line string that will replace the line above
     replacement_lines = """
        max_left_pad = None
        batch_size = self.args.per_device_train_batch_size if mode == "train" else self.args.per_device_eval_batch_size
        try:
            # TRL 0.23.1 and below path
            if not has_images:
                # Left pad prompts before calculating old and ref hidden states
                prompt_completion_ids = left_pack_padding(prompt_completion_ids, self.processing_class.pad_token_id)
            self.model.for_training()
            left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
            max_left_pad = max(left_pad_tokens_per_prompt).item()
        except:
            # TRL 0.24.0 and above path
            if images is None:
                # Left pad prompts before calculating old and ref hidden states
                prompt_completion_ids = left_pack_padding(prompt_completion_ids, self.processing_class.pad_token_id)
                left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(prompt_completion_ids, logits_to_keep, self.processing_class.pad_token_id)
                max_left_pad = max(left_pad_tokens_per_prompt).item()
            self.model.for_training()"""

     function = function.replace(line_to_replace, replacement_lines)
```
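For reference, `left_pack_padding` and `calculate_pad_tokens_in_prompt` are Unsloth-side helpers; here is a minimal sketch of what they plausibly compute (hypothetical re-implementations, not the library code):

```python
import torch

def left_pack_padding(ids: torch.Tensor, pad_id: int) -> torch.Tensor:
    # Move every pad token to the left so real tokens are right-aligned;
    # a stable argsort keeps the relative order of the real tokens intact.
    order = torch.argsort((ids != pad_id).int(), dim=-1, stable=True)
    return torch.gather(ids, -1, order)

def calculate_pad_tokens_in_prompt(ids: torch.Tensor, logits_to_keep: int, pad_id: int) -> torch.Tensor:
    # Count pad tokens in the prompt region, i.e. everything to the left of
    # the `logits_to_keep` completion tokens, for each row of the batch.
    prompt = ids[:, : ids.shape[1] - logits_to_keep]
    return (prompt == pad_id).sum(dim=-1)

ids = torch.tensor([[7, 8, 0, 0, 5, 6],
                    [0, 9, 4, 0, 3, 2]])  # 0 = pad
print(left_pack_padding(ids, 0))
# tensor([[0, 0, 7, 8, 5, 6],
#         [0, 0, 9, 4, 3, 2]])
print(calculate_pad_tokens_in_prompt(ids, 2, 0))  # tensor([2, 2])
```

With this packing, each row's padding sits at the front, and `max_left_pad` bounds how many extra leading positions must be kept when slicing completion logits later.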
```diff
@@ -319,18 +321,27 @@ def grpo_trainer__generate_and_score_completions(function_name, function):
         if self.use_vllm:"""
     function = function.replace(replace_part, new_replacement)

     # Important note: we disable TRL's importance sampling logic
     string_to_find = "if self.use_vllm and self.vllm_importance_sampling_correction:"

     replacement_string = (
         "if False and self.use_vllm and self.vllm_importance_sampling_correction:"
     )

     function = function.replace(string_to_find, replacement_string)

     string_to_find = """        if "image_sizes" in prompt_inputs:
            output["image_sizes"] = prompt_inputs["image_sizes"]"""

     replacement_string = """        if "image_sizes" in prompt_inputs:
            output["image_sizes"] = prompt_inputs["image_sizes"]
            if self.use_vllm:
                try:
                    if max_left_pad is not None:
                        output["max_left_pad"] = torch.tensor(sampling_per_token_logps.shape[0] * [max_left_pad]).unsqueeze(-1)
                    try:
                        if self.use_vllm and getattr(self, "vllm_importance_sampling_correction", False):
                            output["sampling_per_token_logps"] = sampling_per_token_logps
                    except NameError:
                        output["sampling_per_token_logps"] = None
                except NameError:
                    output["sampling_per_token_logps"] = None"""
     function = function.replace(string_to_find, replacement_string)

     if "wake_up()" not in function:
```
```diff
@@ -510,7 +521,6 @@ def _get_per_token_logps_and_entropies(
     if compute_efficient:
         return None, None
     else:
-        # Otherwise, calculate normally:
         if not hasattr(self, "_autocast_dtype"):
             self._autocast_dtype = (
                 torch.float16
```
The core change replaces the single full-batch forward pass with a per-sample loop:

```diff
@@ -529,47 +539,156 @@ def _get_per_token_logps_and_entropies(
             kwargs.get("image_sizes", None),
         )

-        os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1"
-
         unwrapped_model = self.accelerator.unwrap_model(
             model, keep_fp32_wrapper = False
         )

+        B = input_ids.shape[0]
+        all_logprobs_list = []
+
+        if pixel_values is None:
+            left_pad_tokens_per_prompt = calculate_pad_tokens_in_prompt(
+                input_ids, logits_to_keep, self.processing_class.pad_token_id
+            )
+            max_left_pad = max(left_pad_tokens_per_prompt).item()
+            input_ids = left_pack_padding(
+                input_ids, self.processing_class.pad_token_id
+            )
+            attention_mask = input_ids != self.processing_class.pad_token_id
+            attention_mask = attention_mask.to(attention_mask.dtype)
+        else:
+            max_left_pad = 0
+
+        input_ids_chunks = torch.chunk(input_ids, chunks = B, dim = 0)
+        attention_mask_chunks = torch.chunk(attention_mask, chunks = B, dim = 0)
+
+        def chunk_optional(tensor, chunks):
+            if tensor is None:
+                return [None] * chunks
+            return torch.chunk(tensor, chunks = chunks, dim = 0)
+
+        pixel_values_chunks = [None] * B
+        image_grid_thw_chunks = [None] * B
+        pixel_attention_mask_chunks = [None] * B
+
+        # This is the chunking logic from TRL 0.23.0
+        if image_grid_thw is not None and pixel_values is not None:
+            if image_grid_thw.shape[0] != B:
+                raise ValueError(
+                    f"This logic requires image_grid_thw.shape[0] ({image_grid_thw.shape[0]}) "
+                    f"to be equal to batch size B ({B})."
+                )
+
+            rows_per_sample = image_grid_thw.prod(dim = -1)
+            rows_per_sample_list = rows_per_sample.cpu().tolist()
+
+            pixel_values_chunks = list(
+                torch.split(pixel_values, rows_per_sample_list, dim = 0)
+            )
+            if pixel_attention_mask is not None:
+                pixel_attention_mask_chunks = list(
+                    torch.split(pixel_attention_mask, rows_per_sample_list, dim = 0)
+                )
+
+            image_grid_thw_chunks = list(
+                torch.chunk(image_grid_thw, chunks = B, dim = 0)
+            )
+
+        elif pixel_values is not None:
+            pixel_values_chunks = list(torch.chunk(pixel_values, chunks = B, dim = 0))
+            if pixel_attention_mask is not None:
+                pixel_attention_mask_chunks = list(
+                    torch.chunk(pixel_attention_mask, chunks = B, dim = 0)
+                )
+
+        if image_sizes is not None and not isinstance(image_sizes, torch.Tensor):
+            image_sizes_chunks = [[size] for size in image_sizes]
+        else:
+            image_sizes_chunks = chunk_optional(image_sizes, B)
+
+        lm_head = self.model.get_output_embeddings().weight
+        temperature = self.temperature
+        logit_softcapping = getattr(model.config, "final_logit_softcapping", 0)
+        if logit_softcapping is None:
+            logit_softcapping = 0
+        logit_scale_multiply = getattr(model.config, "logit_scale", 0)
+        if logit_scale_multiply is None:
+            logit_scale_multiply = 0
+        logit_scale_divide = getattr(model.config, "logits_scaling", 0)
+        if logit_scale_divide is None:
+            logit_scale_divide = 0
+
+        zipped_inputs = zip(
+            input_ids_chunks,
+            attention_mask_chunks,
+            pixel_values_chunks,
+            image_grid_thw_chunks,
+            pixel_attention_mask_chunks,
+            image_sizes_chunks,
+        )
+        os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "1"

         with torch.amp.autocast(device_type = "cuda", dtype = self._autocast_dtype):
-            with torch.inference_mode():
-                if pixel_values is None:
-                    attention_mask = input_ids != self.processing_class.pad_token_id
-                    attention_mask = attention_mask.to(attention_mask.dtype)
-                    # We add 1 to `logits_to_keep` because the last logits of the sequence is later excluded
-                    logits = unwrapped_model(
-                        input_ids = input_ids,
-                        attention_mask = attention_mask,
-                        pixel_values = pixel_values,
-                        image_grid_thw = image_grid_thw,
-                        pixel_attention_mask = pixel_attention_mask,
-                        image_sizes = image_sizes,
-                        # logits_to_keep = logits_to_keep + 1,
-                    ).logits
-                else:
-                    logits = unwrapped_model(
-                        input_ids = input_ids,
-                        attention_mask = attention_mask,
-                        pixel_values = pixel_values,
-                        image_grid_thw = image_grid_thw,
-                        pixel_attention_mask = pixel_attention_mask,
-                        image_sizes = image_sizes,
-                        logits_to_keep = logits_to_keep + 1,
-                    ).logits
-
-            entropies = None
-            if compute_entropy:
-                from trl.trainer.utils import entropy_from_logits
-
-                entropies = entropy_from_logits(logits)
+            with torch.no_grad():
+                for (
+                    input_ids_chunk,
+                    attention_mask_chunk,
+                    pixel_values_chunk,
+                    image_grid_thw_chunk,
+                    pixel_attention_mask_chunk,
+                    image_sizes_chunk,
+                ) in zipped_inputs:
+                    if pixel_values is None:
+                        logits_chunk = unwrapped_model(
+                            input_ids = input_ids_chunk,
+                            attention_mask = attention_mask_chunk,
+                            pixel_values = pixel_values_chunk,
+                            image_grid_thw = image_grid_thw_chunk,
+                            pixel_attention_mask = pixel_attention_mask_chunk,
+                            image_sizes = image_sizes_chunk,
+                        ).logits
+
+                        completion_input_ids_chunk = input_ids_chunk[
+                            :, -(logits_to_keep + max_left_pad) :
+                        ]
+                        logits_chunk = logits_chunk[
+                            :, -(logits_to_keep + max_left_pad + 1) :, :
+                        ]
+                        logits_chunk = logits_chunk[:, :-1, :]
+                    else:
+                        logits_chunk = unwrapped_model(
+                            input_ids = input_ids_chunk,
+                            attention_mask = attention_mask_chunk,
+                            pixel_values = pixel_values_chunk,
+                            image_grid_thw = image_grid_thw_chunk,
+                            pixel_attention_mask = pixel_attention_mask_chunk,
+                            image_sizes = image_sizes_chunk,
+                            logits_to_keep = logits_to_keep + 1,
+                        ).logits
+
+                        logits_chunk = logits_chunk[:, :-1, :]
+                        completion_input_ids_chunk = input_ids_chunk[
+                            :, -logits_to_keep:
+                        ]
+                    logprobs_chunk = chunked_hidden_states_selective_log_softmax(
+                        logits_chunk,
+                        lm_head,
+                        completion_input_ids_chunk,
+                        chunks = 8,
+                        logit_scale_multiply = logit_scale_multiply,
+                        logit_scale_divide = logit_scale_divide,
+                        logit_softcapping = logit_softcapping,
+                        temperature = temperature,
+                    )
+
+                    all_logprobs_list.append(logprobs_chunk)
+        logprobs = torch.cat(all_logprobs_list, dim = 0)
+        entropies = None

         os.environ["UNSLOTH_RETURN_HIDDEN_STATES"] = "0"
-        # logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
-        return logits.detach(), entropies  # logps, entropies
+        return logprobs.detach(), entropies  # logps, entropies
         # input_ids = input_ids[:, -logits_to_keep:]
         # For transformers<=4.48, logits_to_keep argument isn't supported, so here we drop logits ourselves.
         # See https://github.com/huggingface/trl/issues/2770
```
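Two pieces of the new chunked path are worth unpacking. First, the vision branch splits the ragged `pixel_values` stack back into per-sample chunks: each row of `image_grid_thw` is a (t, h, w) grid, so its product is the number of patch rows that sample contributed. A toy demonstration of the split:

```python
import torch

image_grid_thw = torch.tensor([[1, 2, 2],   # sample 0 contributes 4 patch rows
                               [1, 3, 3]])  # sample 1 contributes 9 patch rows
pixel_values = torch.randn(13, 1176)        # all samples stacked along dim 0
rows_per_sample = image_grid_thw.prod(dim=-1).tolist()  # [4, 9]
chunks = torch.split(pixel_values, rows_per_sample, dim=0)
print([tuple(c.shape) for c in chunks])  # [(4, 1176), (9, 1176)]
```

Second, because `UNSLOTH_RETURN_HIDDEN_STATES=1` makes the model return hidden states where logits would normally be, the per-token logprobs are recovered by projecting through the LM head slice by slice. A rough sketch of the idea behind `chunked_hidden_states_selective_log_softmax`, under a simplified hypothetical signature (the real helper also handles logit softcapping and scaling):

```python
def selective_log_softmax_sketch(hidden, lm_head, target_ids,
                                 chunks=8, temperature=1.0):
    # Project hidden states (B, L, H) to logits one sequence slice at a time
    # so the full (B, L, vocab) tensor is never materialized, then gather
    # only the log-probability of each target token.
    out = []
    for h, t in zip(torch.chunk(hidden, chunks, dim=1),
                    torch.chunk(target_ids, chunks, dim=1)):
        logits = (h @ lm_head.T).float() / temperature  # (B, l, vocab)
        logps = torch.log_softmax(logits, dim=-1)
        out.append(torch.gather(logps, -1, t.unsqueeze(-1)).squeeze(-1))
    return torch.cat(out, dim=1)
```

Combined with the batch-dimension chunking above (`torch.chunk(input_ids, chunks = B, dim = 0)` runs one sample per forward), peak memory for the old/ref logprob pass scales with a single sample and a single vocab-projection slice rather than the whole batch, which is the point of the PR.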
```diff
@@ -678,14 +797,14 @@ def compute_loss(
     # ref_per_token_logps = per_token_logps = get_logps_func(model, input_ids, attention_mask, logits_to_keep)
     # else:
     # ref_per_token_logps = None
-    ref_hidden_states = inputs.get("ref_per_token_logps", None)
+    ref_logps = inputs.get("ref_per_token_logps", None)
     # per_token_kl = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1
     # x - x.detach() allows for preserving gradients from x
     advantages = inputs["advantages"]
     # per_token_loss = torch.exp(per_token_logps - per_token_logps.detach()) * advantages.unsqueeze(1)
     # per_token_loss = -(per_token_loss - self.beta * per_token_kl)
     # loss = ((per_token_loss * completion_mask).sum(dim=1) / completion_mask.sum(dim=1)).mean()
-    old_hidden_states = inputs.get("old_per_token_logps", None)
+    old_logps = inputs.get("old_per_token_logps", None)

     input_ids = input_ids[:, -logits_to_keep:]
```
```diff
@@ -700,6 +819,7 @@ def compute_loss(
     if logit_scale_divide is None:
         logit_scale_divide = 0

+    max_left_pad = inputs.get("max_left_pad", 0)
     if per_token_logps is not None:
         if ref_hidden_states is not None:
             ref_hidden_states = ref_hidden_states[
```
```diff
@@ -731,6 +851,7 @@ def compute_loss(
         max_completion_length = self.args.max_completion_length,
         delta = self.args.delta,
         temperature = self.args.temperature,
+        max_left_pad = max_left_pad,
         logit_softcapping = logit_softcapping,
         logit_scale_multiply = logit_scale_multiply,
         logit_scale_divide = logit_scale_divide,
```
```diff
@@ -751,8 +872,8 @@ def compute_loss(
         logits_to_keep = logits_to_keep,
         completion_mask = completion_mask,
         advantages = advantages,
-        old_hidden_states = old_hidden_states,
-        ref_hidden_states = ref_hidden_states,
+        old_logps = old_logps,
+        ref_logps = ref_logps,
         n_chunks = self.args.unsloth_num_chunks,
         loss_type = self.args.loss_type,
         importance_sampling_level = self.importance_sampling_level,
```
```diff
@@ -761,6 +882,7 @@ def compute_loss(
         max_completion_length = self.args.max_completion_length,
         delta = self.args.delta,
         temperature = self.args.temperature,
+        max_left_pad = max_left_pad,
         logit_softcapping = logit_softcapping,
         logit_scale_multiply = logit_scale_multiply,
         logit_scale_divide = logit_scale_divide,
```
```diff
@@ -797,7 +919,11 @@ def compute_loss(
     self._metrics["completion_length"].append(completion_length.item())
    self._metrics["kl"].append(mean_kl.item())

-    if self.use_vllm and delta is not None:
+    if (
+        self.use_vllm
+        and delta is not None
+        and getattr(self, "vllm_importance_sampling_correction", False)
+    ):
         mean_delta = (
             torch.mean(delta)
             if delta.numel() > 0
```
Review comment:

The new block in `_generate_and_score_completions` builds `output["max_left_pad"]` using `sampling_per_token_logps.shape` before that variable is guaranteed to exist. In the common non-vLLM path (or when importance sampling correction is disabled), `sampling_per_token_logps` is never defined, so hitting this code will raise a `NameError` before any completions are returned, breaking GRPO training without vLLM. The guard below only protects the later assignment, so `max_left_pad` needs to be gated on the presence of sampling logprobs or sized from another tensor.
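A minimal sketch of one way to apply that suggestion, sizing the tensor from an input that always exists on this path and attaching sampling logprobs only when they were produced (illustrative, not part of the PR; variables come from the surrounding trainer body):

```python
if max_left_pad is not None:
    # prompt_completion_ids is always defined here, unlike
    # sampling_per_token_logps, so use it to size the per-row tensor.
    num_rows = prompt_completion_ids.shape[0]
    output["max_left_pad"] = torch.full((num_rows, 1), max_left_pad)
try:
    output["sampling_per_token_logps"] = sampling_per_token_logps
except NameError:
    output["sampling_per_token_logps"] = None
```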