Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion 3rdparty/cutlass
Submodule cutlass updated 671 files
13 changes: 7 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -584,16 +584,17 @@ def get_autotune_warmup_request():

available_blocks = kv_cache_manager.get_num_free_blocks()

maximum_tunable_num_tokens = min(
self.batch_size * num_tokens_per_request, self.max_num_tokens,
available_blocks * kv_cache_manager.tokens_per_block)

# Calculate number of full-length requests and remaining tokens
# Each request has num_tokens_per_request tokens, except possibly the last one
full_len_request_num = self.max_num_tokens // num_tokens_per_request
remaining_tokens = self.max_num_tokens % num_tokens_per_request
full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request

request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1

if self.max_num_tokens > available_blocks * kv_cache_manager.tokens_per_block:
return None, None

requests = kv_cache_manager.add_dummy_requests(
request_ids=list(range(full_len_request_num)),
token_nums=[num_tokens_per_request] * full_len_request_num,
Expand All @@ -617,7 +618,7 @@ def get_autotune_warmup_request():
result.context_requests = requests
result.generation_requests = []

return result, _create_extra_inputs(1, self.max_num_tokens)
return result, _create_extra_inputs(1, maximum_tunable_num_tokens)

@contextlib.contextmanager
def release_batch(result):
Expand Down
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -443,7 +443,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_
unittest/_torch/auto_deploy/integration/test_ad_build.py SKIP (https://nvbugs/5318103)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=False] SKIP (https://nvbugs/5318143)
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp2pp2-attn_backend=TRTLLM-torch_compile=True] SKIP (https://nvbugs/5318143)
test_e2e.py::test_openai_reasoning SKIP (https://nvbugs/5310329)
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-summarization_long] SKIP (https://nvbugs/5324976)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=False] SKIP (https://nvbugs/5322354)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_nextn=0-overlap_scheduler=True] SKIP (https://nvbugs/5322354)
Expand Down
1 change: 0 additions & 1 deletion tests/unittest/llmapi/test_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -1930,7 +1930,6 @@ def test_llm_get_stats(return_context_logits, enable_iter_req_stats):


def test_llm_get_queued_stats():
pytest.skip("https://nvbugspro.nvidia.com/bug/5325642")
enable_iter_req_stats = True
use_overlap = False
tp_size = 1
Expand Down