Commit 1c7f927

Optimize attention prior computation: reuse qkv and cross_kv, don't copy scores to CPU

Signed-off-by: Viacheslav Klimkov <[email protected]>

1 parent 01a07f3 commit 1c7f927

File tree

5 files changed: +62 -71 lines changed


cpp/include/tensorrt_llm/batch_manager/runtimeBuffers.h

Lines changed: 0 additions & 1 deletion

@@ -287,7 +287,6 @@ class RuntimeBuffers
         DecoderBuffers& decoderBuffers, runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
         runtime::WorldConfig const& worldConfig);
 
-    std::vector<float> getScoresHost(runtime::TllmRuntime const& runtime);
     void setAttentionPriorIdx(RequestVector const& contextRequests, RequestVector const& genRequests,
         runtime::TllmRuntime const& runtime);

cpp/tensorrt_llm/batch_manager/runtimeBuffers.cpp

Lines changed: 42 additions & 35 deletions

@@ -261,7 +261,7 @@ void RuntimeBuffers::create(SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
 
     inputsIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32);
     if (useAttentionPrior) {
-        scores = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
+        scores = manager.emptyTensor(MemoryType::kGPU, modelConfig.getDataType());
     }
     if (worldConfig.isPipelineParallel())
     {
@@ -919,23 +919,19 @@ void RuntimeBuffers::prepareEagleBuffers(RequestVector const& contextRequests, R
     TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
 }
 
-std::vector<float> RuntimeBuffers::getScoresHost(runtime::TllmRuntime const& runtime)
-{
-    auto const& manager = runtime.getBufferManager();
-    auto const& stream = runtime.getStream();
-    std::vector<float> scoresHost;
-    if (!useAttentionPrior) {
-        TLLM_LOG_WARNING("Getting scores, when attention prior is disabled");
-        return scoresHost;
-    }
-    auto scoresShape = scores->getShape();
-    auto scoresSize = ITensor::volume(scoresShape);
-    if (scoresSize > 0) {
-        scoresHost.resize(scoresSize);
-        manager.copy(*scores, scoresHost.data());
-        stream.synchronize(); // Ensure copy completes
+template<typename T>
+static SizeType32 processScoresWithType(ITensor* scoresHost, SizeType32 prevPriorIdxLen) {
+    auto* scoresHostPtr = bufferCast<T>(*scoresHost);
+    T maxScore = scoresHostPtr[0];
+    SizeType32 maxScoreIdx = 0;
+    // Find the index with maximum score in the current subsection
+    for (SizeType32 k = 1; k < prevPriorIdxLen; ++k) {
+        if (scoresHostPtr[k] > maxScore) {
+            maxScore = scoresHostPtr[k];
+            maxScoreIdx = k;
+        }
     }
-    return scoresHost;
+    return maxScoreIdx;
 }
 
 void RuntimeBuffers::setAttentionPriorIdx(
@@ -961,47 +957,58 @@ void RuntimeBuffers::setAttentionPriorIdx(
         totalEncoderOutputLen += llmReq->getEncoderOutputLen();
     }
 
-    SizeType32 offset = 0;
+    SizeType32 qOffset = 0;
     // we skip all context requests
     for (auto const& llmReq : contextRequests) {
-        offset += llmReq->getContextChunkSize() * totalEncoderOutputLen;
+        qOffset += llmReq->getContextChunkSize();
         // for context we just focusing at the beginning of the encoder sequence
         llmReq->setAttentionPriorIdx(0);
     }
 
-    std::vector<float> scoresHost = getScoresHost(runtime);
+    // create a cpu buffer for scores to find max score in
+    SizeType32 searchLength = 10;
+    auto const& manager = runtime.getBufferManager();
+    auto const& stream = runtime.getStream();
+    auto scoresHost = manager.cpu(ITensor::makeShape({searchLength}), scores->getDataType());
 
     // for generation requests, there is no context,
     // but we need to find correct section in (b * encoder_output_len)
     for (SizeType32 i = 0; i < (SizeType32)genRequests.size(); ++i) {
         // skip the context
-        offset += totalContextEncoderOutputLen;
+        SizeType32 kvOffset = totalContextEncoderOutputLen;
         for (SizeType32 j = 0; j < (SizeType32)genRequests.size(); ++j) {
             auto const& llmReq = genRequests[j];
             SizeType32 encoderOutputLen = llmReq->getEncoderOutputLen();
             if (i == j) {
                 // find attnetion prior idx in range [prev_prior_idx; prev_prior_idx + 10]
                 SizeType32 prevPriorIdx = llmReq->getAttentionPriorIdx();
                 // ignore last 3 tokens, move strictly forward, look up to 10 tokens forward
-                SizeType32 prevPriorIdxEnd = std::min(prevPriorIdx + 10, encoderOutputLen - 3);
-
-                // find maximum score and it's index in current subsection of scores buffer
-                SizeType32 maxScoreIdx = prevPriorIdx;
-                SizeType32 maxScore = scoresHost[offset + prevPriorIdx];
-
-                // Find the index with maximum score in the current subsection
-                for (SizeType32 k = prevPriorIdx + 1; k < prevPriorIdxEnd; ++k) {
-                    if (scoresHost[offset + k] > maxScore) {
-                        maxScore = scoresHost[offset + k];
-                        maxScoreIdx = k;
-                    }
+                SizeType32 prevPriorIdxEnd = std::min(prevPriorIdx + searchLength, encoderOutputLen);
+                SizeType32 prevPriorIdxLen = prevPriorIdxEnd - prevPriorIdx;
+
+                // slice relevant section of scores
+                auto scoresSlice = ITensor::slice(scores, {qOffset, kvOffset + prevPriorIdx}, prevPriorIdxLen);
+                // copies and converts to float
+                scoresHost->reshape(ITensor::makeShape({prevPriorIdxLen}));
+                manager.copy(*scoresSlice, *scoresHost);
+                stream.synchronize();
+
+                // find index of maximum score in the window
+                SizeType32 maxScoreIdx = 0;
+                if (scores->getDataType() == nvinfer1::DataType::kFLOAT) {
+                    maxScoreIdx = processScoresWithType<float>(scoresHost.get(), prevPriorIdxLen);
+                } else if (scores->getDataType() == nvinfer1::DataType::kHALF) {
+                    maxScoreIdx = processScoresWithType<half>(scoresHost.get(), prevPriorIdxLen);
+                } else {
+                    TLLM_LOG_WARNING("Unsupported scores data type");
                 }
 
                 // Set the attention prior index to the position with maximum score
-                llmReq->setAttentionPriorIdx(maxScoreIdx);
+                llmReq->setAttentionPriorIdx(prevPriorIdx + maxScoreIdx);
             }
-            offset += encoderOutputLen;
+            kvOffset += encoderOutputLen;
         }
+        qOffset += 1;
     }
 }
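For reference, a minimal standalone NumPy sketch of the window search that the rewritten setAttentionPriorIdx performs per generation request: only a slice of up to searchLength (10) scores starting at the previous prior index is brought to the host, the argmax is taken inside that window, and the result is shifted back to an absolute encoder position. The function name, the score-row layout, and the toy data below are illustrative assumptions, not the TensorRT-LLM API.

import numpy as np

def next_attention_prior_idx(scores_row, prev_prior_idx, encoder_output_len, search_length=10):
    """Pick the encoder position with the highest cross-attention score
    inside a small window starting at the previous prior index."""
    window_end = min(prev_prior_idx + search_length, encoder_output_len)
    window = scores_row[prev_prior_idx:window_end]  # only this slice would be copied to the host
    max_score_idx = int(np.argmax(window))          # index relative to the window start
    return prev_prior_idx + max_score_idx           # convert back to an absolute position

# toy usage: one decoder step attending over a 20-frame encoder output
scores_row = np.random.rand(20).astype(np.float32)
print(next_attention_prior_idx(scores_row, prev_prior_idx=5, encoder_output_len=20))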

examples/models/contrib/t5tts/convert_checkpoint.py

Lines changed: 0 additions & 6 deletions

@@ -305,12 +305,6 @@ def convert_t5tts_decoder(
             model_dict[f't5_decoder.layers.{i}.cross_attention.q_net.weight'],
             model_dict[f't5_decoder.layers.{i}.cross_attention.kv_net.weight']
         ], dim=0).contiguous()
-        # projections to compute attention scores
-        weights[f'decoder_layers.{i}.q_proj.weight'] = model_dict[
-            f't5_decoder.layers.{i}.cross_attention.q_net.weight'].contiguous()
-        kv_weight = model_dict[f't5_decoder.layers.{i}.cross_attention.kv_net.weight']
-        dim = kv_weight.shape[0] // 2
-        weights[f'decoder_layers.{i}.k_proj.weight'] = kv_weight[:dim, :]
 
         weights[f'decoder_layers.{i}.cross_attention.qkv.weight'] = qkv_weight
         weights[f'decoder_layers.{i}.cross_attention.dense.weight'] = model_dict[
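The q_proj / k_proj weight exports removed above are redundant: the fused cross_attention.qkv.weight built a few lines earlier already stacks the q_net rows on top of the kv_net rows, and the k rows are the first half of kv_net. A small PyTorch sketch of that layout, with hidden_size and the random tensors standing in for the real checkpoint weights:

import torch

hidden_size = 8
q_net = torch.randn(hidden_size, hidden_size)        # stand-in for ...cross_attention.q_net.weight
kv_net = torch.randn(2 * hidden_size, hidden_size)   # stand-in for ...cross_attention.kv_net.weight

# what the converter still builds for decoder_layers.{i}.cross_attention.qkv.weight
qkv_weight = torch.cat([q_net, kv_net], dim=0).contiguous()

# the removed q_proj / k_proj weights are just slices of the fused tensor
assert torch.equal(qkv_weight[:hidden_size], q_net)
assert torch.equal(qkv_weight[hidden_size:2 * hidden_size], kv_net[:hidden_size])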

tensorrt_llm/layers/attention.py

Lines changed: 2 additions & 2 deletions

@@ -1555,9 +1555,9 @@ def transpose_for_scores(x,
             context = dense_conditional.add_output(skip_case, context)
 
         if use_cache:
-            return (context, past_key_value)
+            return (context, qkv, cross_kv, past_key_value)
         else:
-            return context
+            return (context, qkv, cross_kv)
 
     def set_rel_attn_table(self, max_seq_len, precomputed_relative_attention):
         self.rel_attn_table = Parameter(shape=(self.num_attention_heads,
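Since Attention.forward now also returns the fused qkv and cross_kv tensors, every caller has to unpack a wider tuple: four values with use_cache, three without. A toy sketch of the new calling convention, with plain strings standing in for the TensorRT-LLM tensors:

def attention_forward(use_cache):
    # stand-in for the real layer; mirrors the return tuples in the diff above
    context, qkv, cross_kv, past_key_value = "ctx", "qkv", "cross_kv", "cache"
    if use_cache:
        return (context, qkv, cross_kv, past_key_value)
    return (context, qkv, cross_kv)

# callers now unpack the extra projections (and discard them if unused)
context, qkv, cross_kv, presents = attention_forward(use_cache=True)
context, qkv, cross_kv = attention_forward(use_cache=False)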

tensorrt_llm/models/t5tts/model.py

Lines changed: 18 additions & 27 deletions

@@ -25,7 +25,7 @@
                                      LayerNormType, MLPType,
                                      PositionEmbeddingType, Tensor, assertion,
                                      concat, gather_last_token_logits, maximum,
-                                     minimum, recv, select, send, shape, view, mean, add,
+                                     minimum, recv, select, send, shape, view, mean, add, slice,
                                      squeeze, unsqueeze, transpose, matmul, stack, cast)
 from tensorrt_llm.layers import (MLP, Attention, AttentionMaskParams,
                                  AttentionMaskType, AttentionParams,
@@ -428,6 +428,7 @@ def __init__(self,
 
         # e.g. BART post, T5 pre
         self.layernorm_position = layernorm_position
+        self.hidden_size = hidden_size
 
         # e.g. BART q_scaling = 1.f, T5 q_scaling = 1.f/sqrt(head_size)
         self.self_attention = Attention(
@@ -455,26 +456,6 @@ def __init__(self,
             eps=layernorm_eps,
             dtype=dtype, bias=False)
 
-        # to compute cross attention scores
-        self.q_proj = ColumnLinear(
-            hidden_size,
-            hidden_size,
-            bias=False,
-            dtype=dtype,
-            tp_group=mapping.tp_group,
-            tp_size=mapping.tp_size,
-            gather_output=True,
-        )
-        self.k_proj = ColumnLinear(
-            hidden_size,
-            hidden_size,
-            bias=False,
-            dtype=dtype,
-            tp_group=mapping.tp_group,
-            tp_size=mapping.tp_size,
-            gather_output=True,
-        )
-
         # Note: self attn uses MMHA, mask is always causal triangular
         # cross attn has two scenarios:
         # - in context phase, all ones mask, same as padding type
@@ -558,18 +539,16 @@ def forward(self,
             kv_cache_params=kv_cache_params,
             attention_params=attention_params)
         if use_cache:
-            attention_output, presents_self = attention_output
+            attention_output, _, _, presents_self = attention_output
+        else:
+            attention_output, _, _ = attention_output
         hidden_states = residual + attention_output
 
         # cross attention
         residual = hidden_states
 
         hidden_states = self.cross_attention_layernorm(hidden_states)
         encoder_output = self.cross_attention_memory_layernorm(encoder_output)
-        # compute attention scores
-        q = cast(self.q_proj(hidden_states), "float32")  # b * context x hidden
-        k = cast(self.k_proj(encoder_output), "float32")  # b * enc x hidden
-        scores = matmul(q, k, transb=True)  # b * context x b * enc
         attention_output = self.cross_attention(
             hidden_states=hidden_states,
             attention_mask=attention_mask_params.cross_attention_mask,
@@ -582,9 +561,21 @@ def forward(self,
             cross_kv_cache_gen=cross_kv_cache_gen,
             cross_kv_reuse=cross_kv_reuse)
         if use_cache:
-            attention_output, presents_cross = attention_output
+            attention_output, qkv, cross_kv, presents_cross = attention_output
+        else:
+            attention_output, qkv, cross_kv = attention_output
         hidden_states = residual + attention_output
 
+        # compute attention scores
+        # TODO: assumes padding disabled
+        q = slice(qkv, concat([0, 0]), concat([shape(qkv, 0), self.hidden_size]))
+        k = slice(cross_kv, concat([0, 0]), concat([shape(cross_kv, 0), self.hidden_size]))
+        scores = matmul(
+            q,
+            k,
+            transb=True
+        )
+
         # conv ff (norm -> conv -> residual)
         residual = hidden_states
         hidden_states = self.pos_ff_layernorm(hidden_states)
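A standalone NumPy sketch of the score computation the decoder layer now performs by reusing the fused projections: q is the first hidden_size columns of qkv, k is the first hidden_size columns of cross_kv, and the scores are q @ k^T. The shapes below and the no-padding assumption mirror the TODO in the diff and are purely illustrative.

import numpy as np

hidden_size = 8
num_decoder_tokens = 3     # b * decoder tokens, padding assumed disabled
num_encoder_tokens = 20    # b * encoder_output_len, padding assumed disabled

# stand-ins for the fused projection outputs returned by cross attention
qkv = np.random.rand(num_decoder_tokens, 3 * hidden_size).astype(np.float32)
cross_kv = np.random.rand(num_encoder_tokens, 2 * hidden_size).astype(np.float32)

# equivalent of the slice(...) calls in the diff
q = qkv[:, :hidden_size]
k = cross_kv[:, :hidden_size]

# equivalent of matmul(q, k, transb=True): one score per (decoder token, encoder frame)
scores = q @ k.T
assert scores.shape == (num_decoder_tokens, num_encoder_tokens)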
