Skip to content

Commit 2e74a61

Browse files
committed
[KV cache manager] Simplify block allocation
Inputs may arrive with or without chunked prefill. That distinction is handled by the outer control flow, namely the caller of addSequence. Keep addSequence simple by having it allocate blocks for the specified inputLength. Signed-off-by: eopXD <[email protected]>
1 parent ba3e3d6 commit 2e74a61

File tree

1 file changed

+5
-9
lines changed

1 file changed

+5
-9
lines changed

cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1751,7 +1751,8 @@ SizeType32 KVCacheManager::getNeededBlocksOneStep(
17511751

17521752
SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req, SizeType32 windowSize) const
17531753
{
1754-
1754+
TLLM_CHECK_WITH_INFO(
1755+
mSinkBlockTokenLength == 0 && mSinkBubbleLength == 0, "streamLLM is not supported at the moment");
17551756
if (isCrossKv())
17561757
{
17571758
if (req.isContextInitState() && req.getContextCurrentPosition() == 0)
@@ -1762,14 +1763,9 @@ SizeType32 KVCacheManager::getRemainingBlocksToCompletion(LlmRequest const& req,
17621763
return 0; // cross KV cache doesn't grow after the initial context phase
17631764
}
17641765

1765-
auto const temporaryAttentionWindow = mBlockManager.getWindowSizeMetadata(windowSize).temporaryAttentionWindow;
1766-
1767-
SizeType32 const numContextBlocks
1768-
= (std::min(req.mPromptLen, windowSize + temporaryAttentionWindow) + mSinkBubbleLength) / getTokensPerBlock();
1766+
SizeType32 const numContextBlocks = req.mPromptLen / getTokensPerBlock();
17691767

1770-
SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv(
1771-
std::min(req.mPromptLen + req.mMaxNewTokens, windowSize + temporaryAttentionWindow) + mSinkBubbleLength,
1772-
getTokensPerBlock());
1768+
SizeType32 const numTotalBlocksPerBeam = tc::ceilDiv(req.mPromptLen + req.mMaxNewTokens, getTokensPerBlock());
17731769

17741770
SizeType32 const numGenBlocksPerBeam = numTotalBlocksPerBeam - numContextBlocks;
17751771

@@ -1898,7 +1894,7 @@ void KVCacheManager::addSequence(
18981894
auto const temporaryAttentionWindow = metadata.temporaryAttentionWindow;
18991895

19001896
// Consider the temporaryAttentionWindow when allocating blocks.
1901-
auto const effectiveInputLength = std::min(inputLength, maxTokenNum + temporaryAttentionWindow);
1897+
auto const effectiveInputLength = inputLength;
19021898
auto const numContextBlocks = tc::ceilDiv(effectiveInputLength, getTokensPerBlock());
19031899
if (!sequence.isCyclic() && mEnableBlockReuse)
19041900
{

0 commit comments

Comments
 (0)