Commit a3d7cd6

refactor: Remove maxNumSequences parameter from MakeDecodingBatchInputOutput
- Removed the maxNumSequences parameter from createDecoderBatchInputs and related function calls, streamlining the interface.
- Updated all relevant implementations and tests to reflect the changed function signatures.

Signed-off-by: Robin Kobus <[email protected]>
1 parent 4115bd5 commit a3d7cd6
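
At a glance, the commit drops the trailing size argument from the static helper; both signatures below are taken verbatim from the header diff further down:

// Before
static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
    runtime::decoder::DecoderState const& decoderState, std::vector<TensorPtr> const& logits,
    SizeType32 maxNumSequences);

// After
static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
    runtime::decoder::DecoderState const& decoderState, std::vector<TensorPtr> const& logits);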

File tree

4 files changed: +11 −14 lines

cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h

Lines changed: 2 additions & 3 deletions
@@ -48,12 +48,11 @@ class MakeDecodingBatchInputOutput : Algorithm
 
     void operator()(DecoderInputBuffers& inputBuffers, runtime::decoder::DecoderState& decoderState,
         RequestVector const& contextRequests, RequestVector const& generationRequests,
-        std::vector<TensorPtr> const& logits, runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
+        std::vector<TensorPtr> const& logits, runtime::ModelConfig const& modelConfig,
         OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
 
     static void createDecoderBatchInputs(DecoderInputBuffers& inputBuffers, std::vector<SizeType32> const& activeSlots,
-        runtime::decoder::DecoderState const& decoderState, std::vector<TensorPtr> const& logits,
-        SizeType32 maxNumSequences);
+        runtime::decoder::DecoderState const& decoderState, std::vector<TensorPtr> const& logits);
 };
 
 } // namespace tensorrt_llm::batch_manager
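
For context, a minimal caller-side sketch of the updated helper. The wrapper function prepareDecoderStep and the hard-coded slot list are illustrative assumptions, not part of this commit; only the createDecoderBatchInputs call reflects the new interface:

#include <vector>

#include "tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h"

namespace tb = tensorrt_llm::batch_manager;
namespace tr = tensorrt_llm::runtime;

// Hypothetical wrapper: builds decoder inputs for the currently active requests.
void prepareDecoderStep(tb::DecoderInputBuffers& inputBuffers, tr::decoder::DecoderState const& decoderState,
    std::vector<tb::MakeDecodingBatchInputOutput::TensorPtr> const& logits)
{
    // One entry per active request; callers no longer pass maxNumSequences,
    // since buffer sizing now follows the active slots themselves.
    std::vector<tr::SizeType32> const activeSlots{0, 1, 2};

    tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, logits);
}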

cpp/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.cpp

Lines changed: 4 additions & 5 deletions
@@ -33,7 +33,7 @@ using TensorPtr = MakeDecodingBatchInputOutput::TensorPtr;
 
 void MakeDecodingBatchInputOutput::createDecoderBatchInputs(DecoderInputBuffers& inputBuffers,
     std::vector<SizeType32> const& activeSlots, runtime::decoder::DecoderState const& decoderState,
-    std::vector<TensorPtr> const& logits, SizeType32 maxNumSequences)
+    std::vector<TensorPtr> const& logits)
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
@@ -46,7 +46,7 @@ void MakeDecodingBatchInputOutput::createDecoderBatchInputs(DecoderInputBuffers&
 
     for (SizeType32 step = 0; step < maxDecoderSteps; ++step)
     {
-        batchSlots.at(step)->resize(maxNumSequences);
+        batchSlots.at(step)->resize(activeSlots.size());
     }
 
     std::vector<SizeType32> batchIdx(maxDecoderSteps);
@@ -181,14 +181,13 @@ void setEagleInputs(tr::DecodingInput& dInput, RuntimeBuffers const& fusedRuntim
 void MakeDecodingBatchInputOutput::operator()(DecoderInputBuffers& inputBuffers,
     runtime::decoder::DecoderState& decoderState, RequestVector const& contextRequests,
     RequestVector const& generationRequests, std::vector<TensorPtr> const& logits,
-    runtime::ModelConfig const& modelConfig, SizeType32 maxNumSequences,
-    OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const
+    runtime::ModelConfig const& modelConfig, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
 
     auto [activeSlots, generationSteps] = getActiveSlots(contextRequests, generationRequests);
 
-    createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, logits, maxNumSequences);
+    createDecoderBatchInputs(inputBuffers, activeSlots, decoderState, logits);
 
     auto const maxBeamWidth = decoderState.getMaxBeamWidth();
     if (maxBeamWidth > 1)
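
The one behavioral change is the resize target: each step's batch-slot buffer is now sized to the number of active slots rather than to the engine-wide maximum. A self-contained sketch of that sizing pattern, with plain std::vector standing in for the runtime tensor type:

#include <cstdint>
#include <iostream>
#include <vector>

using SizeType32 = std::int32_t;

int main()
{
    std::vector<SizeType32> const activeSlots{0, 2, 5}; // three active requests out of a larger pool
    SizeType32 const maxDecoderSteps = 2;               // the decoder may run several steps per forward

    // One slot buffer per decoder step, each sized to the active batch only;
    // before this commit they were resized to maxNumSequences regardless of occupancy.
    std::vector<std::vector<SizeType32>> batchSlots(maxDecoderSteps);
    for (SizeType32 step = 0; step < maxDecoderSteps; ++step)
    {
        batchSlots.at(step).resize(activeSlots.size());
    }

    std::cout << "per-step slot count: " << batchSlots.front().size() << '\n'; // prints 3
    return 0;
}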

cpp/tensorrt_llm/batch_manager/trtGptModelInflightBatching.cpp

Lines changed: 1 addition & 1 deletion
@@ -2008,7 +2008,7 @@ runtime::CudaEvent TrtGptModelInflightBatching::decoderStepAsync(ScheduledReques
     auto& fusedRuntimeBuffers = mBuffers.at(fusedBufferId);
 
     (*mMakeDecodingBatchInputOutput)(decoderInputBuffers, *mDecoderState, scheduledRequests.contextRequests,
-        scheduledRequests.generationRequests, seqSlotLogits, mModelConfig, getMaxNumSequences(), *fusedRuntimeBuffers);
+        scheduledRequests.generationRequests, seqSlotLogits, mModelConfig, *fusedRuntimeBuffers);
 
     auto decoderFinishEvent = mDecoder->forwardAsync(*mDecoderState, decoderInputBuffers);

cpp/tests/runtime/gptDecoderBatchedTest.cpp

Lines changed: 4 additions & 5 deletions
@@ -27,7 +27,6 @@
 #include "tensorrt_llm/runtime/iBuffer.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/runtimeKernels.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 #include <gmock/gmock-matchers.h>
@@ -354,7 +353,7 @@ void testDecoder(nvinfer1::DataType const dtype, std::vector<SamplingConfig>& sa
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
     tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        inputBuffers, activeSlots, decoderState, decoderInputs.logits);
     decoder.forward(decoderState, inputBuffers);
 
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
@@ -484,7 +483,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     auto activeSlots = std::vector<SizeType32>(batchIdx + 1);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
     tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        inputBuffers, activeSlots, decoderState, decoderInputs.logits);
     decoder.forward(decoderState, inputBuffers);
 
     advanceSequenceLengths(
@@ -507,7 +506,7 @@ void testDecoderWavefront(nvinfer1::DataType const dtype, std::vector<SamplingCo
     while (!std::all_of(expectedFinished.begin(), expectedFinished.end(), [](bool finish) { return finish; }))
     {
         tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-            inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+            inputBuffers, activeSlots, decoderState, decoderInputs.logits);
         decoder.forward(decoderState, inputBuffers);
         finishedVec = getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager);
 
@@ -643,7 +642,7 @@ void testDecoderDraft(nvinfer1::DataType const dtype, std::vector<SamplingConfig
     auto activeSlots = std::vector<SizeType32>(batchSize);
     std::iota(activeSlots.begin(), activeSlots.end(), 0);
     tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
-        inputBuffers, activeSlots, decoderState, decoderInputs.logits, batchSize);
+        inputBuffers, activeSlots, decoderState, decoderInputs.logits);
     decoder.forward(decoderState, inputBuffers);
     checkSequenceLengths(*decoderState.getSequenceLengths(), expectedLengths, manager);
     EXPECT_THAT(getFinished(*decoderState.getFinishedSum(), samplingConfigs, manager), ::testing::Each(false));
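
Condensed from the hunks above, the call pattern the tests now share (identifiers exactly as they appear in gptDecoderBatchedTest.cpp):

// Mark all batchSize slots active, ...
auto activeSlots = std::vector<SizeType32>(batchSize);
std::iota(activeSlots.begin(), activeSlots.end(), 0);
// ... build the decoder inputs without the former trailing batchSize argument,
// and run one decoder step on the prepared buffers.
tb::MakeDecodingBatchInputOutput::createDecoderBatchInputs(
    inputBuffers, activeSlots, decoderState, decoderInputs.logits);
decoder.forward(decoderState, inputBuffers);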
