@@ -584,16 +584,17 @@ def get_autotune_warmup_request():
584584
585585 available_blocks = kv_cache_manager .get_num_free_blocks ()
586586
587+ maximum_tunable_num_tokens = min (
588+ self .batch_size * num_tokens_per_request , self .max_num_tokens ,
589+ available_blocks * kv_cache_manager .tokens_per_block )
590+
587591 # Calculate number of full-length requests and remaining tokens
588592 # Each request has num_tokens_per_request tokens, except possibly the last one
589- full_len_request_num = self . max_num_tokens // num_tokens_per_request
590- remaining_tokens = self . max_num_tokens % num_tokens_per_request
593+ full_len_request_num = maximum_tunable_num_tokens // num_tokens_per_request
594+ remaining_tokens = maximum_tunable_num_tokens % num_tokens_per_request
591595
592596 request_num = full_len_request_num if remaining_tokens == 0 else full_len_request_num + 1
593597
594- if self .max_num_tokens > available_blocks * kv_cache_manager .tokens_per_block :
595- return None , None
596-
597598 requests = kv_cache_manager .add_dummy_requests (
598599 request_ids = list (range (full_len_request_num )),
599600 token_nums = [num_tokens_per_request ] * full_len_request_num ,
@@ -617,7 +618,7 @@ def get_autotune_warmup_request():
617618 result .context_requests = requests
618619 result .generation_requests = []
619620
620- return result , _create_extra_inputs (1 , self . max_num_tokens )
621+ return result , _create_extra_inputs (1 , maximum_tunable_num_tokens )
621622
622623 @contextlib .contextmanager
623624 def release_batch (result ):
0 commit comments