@@ -945,15 +945,11 @@ def seq_slot_assignment(
     def mock_requests(
         self,
         sampling_params_list: list[SamplingParams],
-        with_draft_logits: bool,
-        vocab_size: int,
         seq_slot_assignment: tuple[list[int], int],
         draft_lens: list[int],
     ) -> ScheduledRequests:
         return self._build_mock_requests(
             sampling_params_list=sampling_params_list,
-            with_draft_logits=with_draft_logits,
-            vocab_size=vocab_size,
             seq_slot_assignment=seq_slot_assignment,
             draft_lens=draft_lens,
         )
@@ -962,8 +958,6 @@ def _build_mock_requests(
         self,
         sampling_params_list: list[SamplingParams],
         *,
-        with_draft_logits: bool,
-        vocab_size: int,
         seq_slot_assignment: tuple[list[int], int],
         draft_lens: list[int],
     ) -> ScheduledRequests:
@@ -975,21 +969,9 @@ def __init__(
                 self,
                 sampling_params_list: list[SamplingParams],
                 *,
-                with_draft_logits: bool,
                 draft_lens: list[int],
             ):
                 self._sampling_params_list = sampling_params_list
-                self._with_draft_logits = with_draft_logits
-
-                def _attach_draft_logits(req: LlmRequest) -> LlmRequest:
-                    draft_len = len(req.py_draft_tokens)
-                    if draft_len and with_draft_logits:
-                        req.py_draft_logits = torch.testing.make_tensor(  # type: ignore
-                            (draft_len, vocab_size),
-                            dtype=torch.float32,
-                            device="cuda",
-                        )
-                    return req
 
                 # NB:
                 # - stop words are tested in test_write_finish_reasons
@@ -999,24 +981,22 @@ def _attach_draft_logits(req: LlmRequest) -> LlmRequest:
                 # - py_return_log_probs is tested elsewhere
                 # - code paths gated by py_return_context_logits tested in test_select_generated_logits
                 self._gen_requests = [
-                    _attach_draft_logits(
-                        LlmRequest(
-                            request_id=seq_slot,
-                            max_new_tokens=(2 * draft_len),  # not used by tested code
-                            input_tokens=[12],  # not used by tested code
-                            sampling_config=SamplingConfig(sampling_params._get_sampling_config()),
-                            seq_slot=seq_slot,
-                            is_streaming=False,  # not relevant for tested code
-                            draft_tokens=(  # 'len(.py_draft_tokens)' is inspected by get_draft_token_length
-                                torch.testing.make_tensor(
-                                    (draft_len,),
-                                    dtype=torch.int32,
-                                    device="cpu",
-                                ).tolist()
-                                if draft_len
-                                else None
-                            ),
-                        )
+                    LlmRequest(
+                        request_id=seq_slot,
+                        max_new_tokens=(2 * draft_len),  # not used by tested code
+                        input_tokens=[12],  # not used by tested code
+                        sampling_config=SamplingConfig(sampling_params._get_sampling_config()),
+                        seq_slot=seq_slot,
+                        is_streaming=False,  # not relevant for tested code
+                        draft_tokens=(  # 'len(.py_draft_tokens)' is inspected by get_draft_token_length
+                            torch.testing.make_tensor(
+                                (draft_len,),
+                                dtype=torch.int32,
+                                device="cpu",
+                            ).tolist()
+                            if draft_len
+                            else None
+                        ),
                     )
                     for sampling_params, seq_slot, draft_len in zip(
                         sampling_params_list, seq_slots, draft_lens
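
As the inline comment notes, the tested code only reads len(request.py_draft_tokens) (via get_draft_token_length), which is why arbitrary int32 values of the right length suffice. A minimal sketch of that contract; get_draft_token_length here is a simplified stand-in for the real helper, and _FakeRequest is hypothetical:

    import torch

    def get_draft_token_length(request) -> int:  # simplified stand-in, not the real helper
        draft_tokens = getattr(request, "py_draft_tokens", None)
        return len(draft_tokens) if draft_tokens is not None else 0

    class _FakeRequest:
        def __init__(self, draft_len: int):
            # Only the length matters to the tested code; the values are arbitrary.
            self.py_draft_tokens = (
                torch.testing.make_tensor((draft_len,), dtype=torch.int32, device="cpu").tolist()
                if draft_len
                else None
            )

    assert get_draft_token_length(_FakeRequest(3)) == 3
    assert get_draft_token_length(_FakeRequest(0)) == 0
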
@@ -1040,9 +1020,7 @@ def all_requests(self) -> list[LlmRequest]:
         with torch.inference_mode(True):
             return cast(
                 ScheduledRequests,
-                ScheduledRequestsMock(
-                    sampling_params_list, with_draft_logits=with_draft_logits, draft_lens=draft_lens
-                ),
+                ScheduledRequestsMock(sampling_params_list, draft_lens=draft_lens),
             )
 
     @pytest.fixture(scope="function")
@@ -1184,20 +1162,17 @@ def test_backend_selection(
11841162 "max_draft_len" ,
11851163 "draft_lens" ,
11861164 "sampling_params_list" ,
1187- "with_draft_logits" ,
11881165 "params_label" ,
11891166 "allow_zero_draft_len" ,
11901167 "vocab_size" ,
11911168 ),
11921169 [
1193- # NB: with_draft_logits=True and non-zero draft len ensures that
1194- # LlmRequest.py_target_probs is set.
1170+ # NB: non-zero draft len ensures that LlmRequest.py_target_probs is set.
11951171 pytest .param (
11961172 use_flashinfer ,
11971173 3 ,
11981174 [3 ] * len (sampling_params_list ),
11991175 sampling_params_list ,
1200- True ,
12011176 params_label ,
12021177 False ,
12031178 vocab_size ,
@@ -1225,7 +1200,6 @@ def test_probs(
         allow_zero_draft_len: bool,  # used by fixtures
         sampling_params_list: list[SamplingParams],
         seq_slot_assignment: tuple[list[int], int],
-        with_draft_logits: bool,
     ):
         """Validate probabilities returned by sample_async.
 
@@ -1255,9 +1229,7 @@ def _uut_provider(is_warmup: bool) -> Generator[Callable[[], None], None, None]:
             # requests.
             uut_mock_requests = self._build_mock_requests(
                 sampling_params_list=sampling_params_list,
-                vocab_size=vocab_size,
                 seq_slot_assignment=seq_slot_assignment,
-                with_draft_logits=with_draft_logits,
                 draft_lens=draft_lens,
             )
         else:
@@ -1427,11 +1399,8 @@ def _compute_probs(
         )
         mock_requests_with_probs = self._build_mock_requests(
             sampling_params_list=sampling_params_list,
-            vocab_size=vocab_size,
             seq_slot_assignment=seq_slot_assignment,
-            # NB: with_draft_logits=True and non-zero draft len ensures that
-            # LlmRequest.py_target_probs is set.
-            with_draft_logits=True,
+            # NB: non-zero draft len ensures that LlmRequest.py_target_probs is set.
             draft_lens=([draft_len_with_probs] * len(sampling_params_list)),
         )
         # zero-pad logits to draft_len_with_probs
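
The hunk is cut off at the zero-padding step. For intuition, zero-padding each request's logits along the step axis keeps a batch with mixed draft lengths rectangular; a rough sketch with assumed shapes (not taken from the test):

    import torch
    import torch.nn.functional as F

    draft_len_with_probs = 3
    vocab_size = 32
    # Hypothetical logits for a request with draft_len = 1: one row per draft
    # step plus one for the final token.
    logits = torch.randn(2, vocab_size)
    pad_rows = (draft_len_with_probs + 1) - logits.shape[0]
    padded = F.pad(logits, (0, 0, 0, pad_rows))  # append zero rows at the end
    assert padded.shape == (draft_len_with_probs + 1, vocab_size)
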
@@ -1818,6 +1787,13 @@ def _validate_token_frequencies(
 
         # Perform G-test (asymptotically approximated by Pearson's chi-square test) to
         # check that sampled tokens are consistent with the expected probs.
+        #
+        # NB: Use FP64 to avoid negative test statistic values, and renormalize
+        # the expected counts so that their total matches the observed total.
+        test_token_counts_ma = test_token_counts_ma.astype(np.float64)
+        test_expected_counts_ma = test_expected_counts_ma.astype(np.float64)
+        test_expected_counts_ma /= test_expected_counts_ma.sum(axis=-1, keepdims=True)
+        test_expected_counts_ma *= num_samples
         test_result = power_divergence(
             f_obs=test_token_counts_ma,
             f_exp=test_expected_counts_ma,
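
Why the FP64 cast and renormalization matter: the G-statistic is 2 * sum(obs * ln(obs / exp)), which can come out slightly negative in FP32 when observed and expected counts nearly coincide, and SciPy's power_divergence also requires the observed and expected totals to agree to a small relative tolerance. A minimal, self-contained sketch of the same pattern (illustrative values, not from the test):

    import numpy as np
    from scipy.stats import power_divergence

    num_samples = 100_000
    probs = np.full(8, 1 / 8)
    counts = np.random.default_rng(0).multinomial(num_samples, probs).astype(np.float64)

    # Rescale expected counts so that sum(f_exp) == sum(f_obs), as
    # power_divergence demands; lambda_="log-likelihood" selects the G-test.
    expected = probs / probs.sum() * counts.sum()
    result = power_divergence(f_obs=counts, f_exp=expected, lambda_="log-likelihood")
    print(result.statistic, result.pvalue)  # statistic stays non-negative in FP64
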
@@ -1847,7 +1822,6 @@ def _validate_token_frequencies(
18471822 "use_flashinfer" ,
18481823 "max_draft_len" ,
18491824 "sampling_params_list" ,
1850- "with_draft_logits" ,
18511825 "allow_zero_draft_len" ,
18521826 "bypass_sampling" ,
18531827 "vocab_size" ,
@@ -1857,7 +1831,6 @@ def _validate_token_frequencies(
                 use_flashinfer,
                 max_draft_len,
                 sampling_params_list,
-                with_draft_logits,
                 allow_zero_draft_len,
                 # Run full sampling test only for uniform batches, with/without probs, but skip
                 # sampling statistics when varying draft lens etc. to validate batch handling:
@@ -1868,22 +1841,20 @@ def _validate_token_frequencies(
18681841 id = (
18691842 f"{ 'FlashInfer' if use_flashinfer else 'Torch' } "
18701843 f"-draft_len={ 0 if allow_zero_draft_len else 1 } ..{ max_draft_len } "
1871- f"-return_probs= { with_draft_logits } - { params_label } "
1844+ f"-{ params_label } "
18721845 ),
18731846 )
18741847 # https://stackoverflow.com/a/75421799, does not work with nested loops
18751848 for (
18761849 use_flashinfer ,
18771850 is_mixed ,
1878- with_draft_logits ,
18791851 max_draft_len ,
18801852 allow_zero_draft_len ,
18811853 _build_test_cases ,
18821854 vocab_size ,
18831855 ) in product (
18841856 [False , True ],
18851857 [False , True ],
1886- [True , False ],
18871858 [0 , 3 ],
18881859 [False , True ],
18891860 [_build_test_cases ],
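
The pattern above (see the linked Stack Overflow answer) replaces nested parametrize loops with a single itertools.product over the test axes, filtered by the comprehension's trailing if. A generic sketch with made-up axis names, not the suite's actual parameters:

    from itertools import product

    import pytest

    CASES = [
        pytest.param(backend, draft_len, id=f"{backend}-draft_len={draft_len}")
        for backend, draft_len in product(["torch", "flashinfer"], [0, 3])
        if draft_len > 0 or backend == "torch"  # drop invalid combinations
    ]

    @pytest.mark.parametrize(("backend", "draft_len"), CASES)
    def test_example(backend, draft_len):
        assert backend in ("torch", "flashinfer")
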
@@ -1895,16 +1866,14 @@ def _validate_token_frequencies(
                 include_uniform=(not is_mixed),
                 include_mixed=is_mixed,
             )
-            if (allow_zero_draft_len or max_draft_len > 0)
-            and (not with_draft_logits or max_draft_len > 0)
+            if allow_zero_draft_len or max_draft_len > 0
         ],
     )
     def test_samples(
         self,
         vocab_size: int,
         sampling_params_list: list[SamplingParams],
         seq_slot_assignment: tuple[list[int], int],
-        with_draft_logits: bool,
         max_draft_len: int,
         use_flashinfer: bool,
         allow_zero_draft_len: bool,  # used by fixtures
@@ -2038,7 +2007,7 @@ def _uut(res=res):
                 probs = probs[: (draft_len + 1)]
 
                 # check probs are returned only when needed
-                should_return_probs = draft_len and with_draft_logits
+                should_return_probs = bool(draft_len)
                 assert (
                     hasattr(req, "py_target_probs") and req.py_target_probs is not None
                 ) == should_return_probs