
Commit ec2b953

refactor: Enhanced handling of decoder requests and logits within the batch manager (#6055)
Signed-off-by: Robin Kobus <[email protected]>
1 parent 77acb4f commit ec2b953

16 files changed: +168 -155 lines

cpp/include/tensorrt_llm/batch_manager/decoderBuffers.h

Lines changed: 7 additions & 4 deletions
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include "tensorrt_llm/batch_manager/common.h"
 #include "tensorrt_llm/runtime/bufferManager.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
@@ -38,8 +39,8 @@ class DecoderInputBuffers
     using SizeType32 = runtime::SizeType32;
     using TensorPtr = runtime::ITensor::SharedPtr;
 
-    explicit DecoderInputBuffers(SizeType32 maxNumSequences, SizeType32 maxBatchSize, SizeType32 maxDecoderSteps,
-        runtime::BufferManager const& manager);
+    explicit DecoderInputBuffers(
+        SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, runtime::BufferManager const& manager);
 
     void setupMedusaLogits(SizeType32 maxNumSequences, runtime::ModelConfig const& modelConfig);
 
@@ -56,11 +57,13 @@ class DecoderInputBuffers
 
     //! Buffers for decoder forward
 
+    //! Requests considered in decoder forward
+    RequestVector decoderRequests;
+
     //! Batch slots for all decoder steps, [maxDecoderSteps][maxBatchSize]
     std::vector<TensorPtr> forwardBatchSlots;
 
-    //! Logits for all batch slots, [maxNumSequences]
-    //! The vector is sparse, only slots in forwardBatchSlots are used.
+    //! Logits of decoder requests
     std::vector<TensorPtr> logits;
 
     //! Logits for speculative decoding (Medusa)
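
The layout change above is the core of the refactor: instead of a sparse logits vector sized to maxNumSequences and indexed by seqSlot, DecoderInputBuffers now keeps two dense, parallel vectors. A minimal stand-alone sketch of the resulting invariant, using hypothetical stand-in types rather than the real TensorRT-LLM classes:

#include <cassert>
#include <memory>
#include <vector>

struct Tensor
{
};
using TensorPtr = std::shared_ptr<Tensor>;

struct Request
{
    int seqSlot;
};
using RequestPtr = std::shared_ptr<Request>;

// Old layout: logits was sized to maxNumSequences and indexed by seqSlot, so it was
// sparse -- only the slots listed in forwardBatchSlots held valid tensors.
struct OldBuffers
{
    std::vector<TensorPtr> logits; // [maxNumSequences], mostly unused entries
};

// New layout: decoderRequests and logits are parallel, densely packed vectors;
// logits[i] belongs to decoderRequests[i], so no seqSlot indirection is needed and
// no pre-sizing to maxNumSequences is required.
struct NewBuffers
{
    std::vector<RequestPtr> decoderRequests;
    std::vector<TensorPtr> logits; // [decoderRequests.size()]
};

int main()
{
    NewBuffers buffers;
    buffers.decoderRequests.push_back(std::make_shared<Request>(Request{42}));
    buffers.logits.push_back(std::make_shared<Tensor>());
    // The invariant the refactor establishes:
    assert(buffers.decoderRequests.size() == buffers.logits.size());
    return 0;
}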

cpp/include/tensorrt_llm/batch_manager/guidedDecoder.h

Lines changed: 2 additions & 2 deletions
@@ -29,6 +29,7 @@ class GrammarCompiler;
 
 namespace tensorrt_llm::batch_manager
 {
+class DecoderInputBuffers;
 
 class GuidedDecoder
 {
@@ -40,8 +41,7 @@ GuidedDecoder
     GuidedDecoder(executor::GuidedDecodingConfig const& guidedDecodingConfig, SizeType32 maxNumSequences,
         SizeType32 vocabSizePadded, nvinfer1::DataType logitsDtype, runtime::BufferManager const& runtimeBufferManager);
     void build(ScheduledRequests const& scheduledRequests);
-    void execute(ScheduledRequests const& scheduledRequests, runtime::BufferManager const& runtimeBufferManager,
-        std::vector<TensorPtr> const& decoderBuffersLogits);
+    void execute(DecoderInputBuffers const& decoderInputBuffers, runtime::BufferManager const& runtimeBufferManager);
 
 private:
     executor::GuidedDecodingConfig::GuidedDecodingBackend mGuidedDecodingBackend;

cpp/include/tensorrt_llm/batch_manager/logitsPostProcessor.h

Lines changed: 7 additions & 6 deletions
@@ -24,28 +24,29 @@
 
 namespace tensorrt_llm::runtime
 {
-class TllmRuntime;
+class CudaStream;
 }
 
 namespace tensorrt_llm::batch_manager
 {
+class DecoderInputBuffers;
 
 class LogitsPostProcessor : Algorithm
 {
 public:
+    using CudaStreamPtr = std::shared_ptr<runtime::CudaStream>;
+
     using LogitsPostProcessorBatched = std::function<void(std::vector<batch_manager::LlmRequest::RequestIdType> const&,
         std::vector<batch_manager::LlmRequest::TensorPtr>&,
-        std::vector<std::reference_wrapper<batch_manager::LlmRequest::BeamTokens const>> const&,
-        runtime::BufferManager::CudaStreamPtr const&,
+        std::vector<std::reference_wrapper<batch_manager::LlmRequest::BeamTokens const>> const&, CudaStreamPtr const&,
         std::vector<std::optional<batch_manager::LlmRequest::RequestIdType>> const&)>;
 
     constexpr static auto name{"LogitsPostProcessor"};
 
     LogitsPostProcessor() = default;
 
-    bool operator()(RequestVector const& contextRequests, RequestVector const& generationRequests,
-        bool replicateLogitsPostProcessor, std::vector<batch_manager::LlmRequest::TensorPtr>& seqSlotLogits,
-        runtime::WorldConfig const& worldConfig, runtime::TllmRuntime& runtime,
+    bool operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
+        runtime::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
         std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched = std::nullopt) const;
 };

cpp/include/tensorrt_llm/batch_manager/makeDecodingBatchInputOutput.h

Lines changed: 1 addition & 2 deletions
@@ -46,8 +46,7 @@ class MakeDecodingBatchInputOutput : Algorithm
 
     MakeDecodingBatchInputOutput() = default;
 
-    std::unique_ptr<runtime::decoder_batch::Input> operator()(RequestVector const& contextRequests,
-        RequestVector const& generationRequests, DecoderInputBuffers const& inputBuffers,
+    std::unique_ptr<runtime::decoder_batch::Input> operator()(DecoderInputBuffers& inputBuffers,
         runtime::decoder::DecoderState& decoderState, runtime::ModelConfig const& modelConfig,
         SizeType32 maxNumSequences, OptionalRef<RuntimeBuffers> fusedRuntimeBuffers) const;
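
The same consolidation shows up at call sites: request lists no longer travel alongside the buffers. A minimal stand-alone sketch of the pattern, with hypothetical stand-in types rather than the real signatures:

#include <memory>
#include <vector>

struct Request
{
};
using RequestVector = std::vector<std::shared_ptr<Request>>;

struct DecoderInputBuffers // stand-in for the real struct
{
    RequestVector decoderRequests; // filled by the logits handlers earlier in the step
};

// Old shape: three parameters whose contents the caller had to keep consistent.
void makeDecodingInputOld(
    RequestVector const& contextRequests, RequestVector const& generationRequests, DecoderInputBuffers const& buffers)
{
    (void) contextRequests;
    (void) generationRequests;
    (void) buffers;
}

// New shape: one source of truth for "which requests are decoded this step".
void makeDecodingInputNew(DecoderInputBuffers& inputBuffers)
{
    for (auto const& llmReq : inputBuffers.decoderRequests)
    {
        (void) llmReq; // derive batch slots etc. from the embedded request list
    }
}

int main()
{
    DecoderInputBuffers buffers;
    buffers.decoderRequests.push_back(std::make_shared<Request>());
    makeDecodingInputNew(buffers);
    return 0;
}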

cpp/tensorrt_llm/batch_manager/decoderBuffers.cpp

Lines changed: 1 addition & 3 deletions
@@ -31,7 +31,7 @@ namespace tensorrt_llm::batch_manager
 {
 
 DecoderInputBuffers::DecoderInputBuffers(
-    SizeType32 maxNumSequences, SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, BufferManager const& manager)
+    SizeType32 maxBatchSize, SizeType32 maxDecoderSteps, BufferManager const& manager)
 {
     auto const maxBatchSizeShape = ITensor::makeShape({maxBatchSize});
     auto const nvSizeType = TRTDataType<SizeType32>::value;
@@ -49,8 +49,6 @@ DecoderInputBuffers::DecoderInputBuffers(
     {
         forwardBatchSlots.emplace_back(BufferManager::pinnedPool(ITensor::makeShape({maxBatchSize}), nvSizeType));
     }
-
-    logits.resize(maxNumSequences);
 }
 
 void DecoderInputBuffers::setupMedusaLogits(SizeType32 maxNumSequences, ModelConfig const& modelConfig)

cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp

Lines changed: 18 additions & 22 deletions
@@ -16,6 +16,7 @@
 */
 
 #include "tensorrt_llm/batch_manager/guidedDecoder.h"
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/kernels/logitsBitmask.h"
 
@@ -136,8 +137,7 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
     }
 }
 
-void GuidedDecoder::execute(ScheduledRequests const& scheduledRequests, BufferManager const& runtimeBufferManager,
-    std::vector<TensorPtr> const& decoderBuffersLogits)
+void GuidedDecoder::execute(DecoderInputBuffers const& decoderInputBuffers, BufferManager const& runtimeBufferManager)
 {
     auto const& stream = runtimeBufferManager.getStream();
 
@@ -150,32 +150,28 @@ void GuidedDecoder::execute(ScheduledRequests const& scheduledRequests, BufferMa
     mCopyBufferManager.getStream().record(event);
     stream.wait(event);
 
-    SizeType32 batchIdx{0};
-    if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR)
+    if (mGuidedDecodingBackend == executor::GuidedDecodingConfig::GuidedDecodingBackend::kXGRAMMAR
+        && !decoderInputBuffers.decoderRequests.empty())
     {
-        for (auto const& requests : {scheduledRequests.contextRequests, scheduledRequests.generationRequests})
+        SizeType32 batchIdx{0};
+        for (size_t requestIdx = 0; requestIdx < decoderInputBuffers.decoderRequests.size(); ++requestIdx)
         {
-            for (auto const& llmReq : requests)
+            auto const& llmReq = decoderInputBuffers.decoderRequests.at(requestIdx);
+
+            auto const& guidedDecodingParams = llmReq->getGuidedDecodingParams();
+            if (guidedDecodingParams.has_value())
             {
-                if (llmReq->isContextInitState() && !llmReq->isLastContextChunk())
-                {
-                    continue;
-                }
-                auto const& guidedDecodingParams = llmReq->getGuidedDecodingParams();
-                if (guidedDecodingParams.has_value())
-                {
-                    auto const seqSlot = llmReq->mSeqSlot.value();
+                auto const seqSlot = llmReq->mSeqSlot.value();
 
-                    auto const& logits = decoderBuffersLogits.at(seqSlot);
-                    auto const logitsBitmask = ITensor::at(mLogitsBitmask, {seqSlot});
+                auto const& logits = decoderInputBuffers.logits.at(requestIdx);
+                auto const logitsBitmask = ITensor::at(mLogitsBitmask, {seqSlot});
 
-                    // Use void* to unify the code for different mLogitsDtype
-                    *reinterpret_cast<void**>(ITensor::at(mLogitsPtrVecHost, {batchIdx})->data()) = logits->data();
-                    *reinterpret_cast<void**>(ITensor::at(mLogitsBitmaskPtrVecHost, {batchIdx})->data())
-                        = logitsBitmask->data();
+                // Use void* to unify the code for different mLogitsDtype
+                *reinterpret_cast<void**>(ITensor::at(mLogitsPtrVecHost, {batchIdx})->data()) = logits->data();
+                *reinterpret_cast<void**>(ITensor::at(mLogitsBitmaskPtrVecHost, {batchIdx})->data())
+                    = logitsBitmask->data();
 
-                    ++batchIdx;
-                }
+                ++batchIdx;
             }
         }
         if (batchIdx > 0)
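
Note the two different indices in the new loop: logits are fetched by dense position (requestIdx), while the per-sequence bitmask is still keyed by the request's persistent seqSlot. The old context-chunk filter is gone because decoderRequests is populated upstream from last-chunk context requests only (see handleContextLogits.cpp below). A simplified stand-alone model of the indexing, with hypothetical types:

#include <cstddef>
#include <cstdio>
#include <memory>
#include <optional>
#include <vector>

struct Request
{
    int seqSlot;
    std::optional<int> guidedDecodingParams; // stand-in for the real params object
};
using RequestPtr = std::shared_ptr<Request>;

int main()
{
    std::vector<RequestPtr> decoderRequests{
        std::make_shared<Request>(Request{7, std::nullopt}), // no guided decoding
        std::make_shared<Request>(Request{3, 1}),            // uses guided decoding
    };
    std::vector<float*> logits(decoderRequests.size(), nullptr); // parallel vector

    int batchIdx = 0;
    for (std::size_t requestIdx = 0; requestIdx < decoderRequests.size(); ++requestIdx)
    {
        auto const& llmReq = decoderRequests.at(requestIdx);
        if (!llmReq->guidedDecodingParams.has_value())
        {
            continue; // request does not take part in guided decoding
        }
        auto const seqSlot = llmReq->seqSlot;    // keys the persistent bitmask
        auto* reqLogits = logits.at(requestIdx); // keys the dense logits vector
        (void) reqLogits;
        std::printf("batchIdx=%d requestIdx=%zu seqSlot=%d\n", batchIdx, requestIdx, seqSlot);
        ++batchIdx;
    }
    return 0;
}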

cpp/tensorrt_llm/batch_manager/handleContextLogits.cpp

Lines changed: 28 additions & 16 deletions
@@ -76,6 +76,13 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(HandleContextLogits);
 
+    auto& decoderRequests = inputBuffers.decoderRequests;
+    decoderRequests.clear();
+    decoderRequests.reserve(contextRequests.size());
+    auto& allDecoderLogits = inputBuffers.logits;
+    allDecoderLogits.clear();
+    allDecoderLogits.reserve(contextRequests.size());
+
     SizeType32 batchIndex{0};
     SizeType32 logitsIndex{0};
     // Copy logits into decoderBuffers.logits
@@ -115,7 +122,6 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
         // Get the logits from the last context token and draft tokens
         auto const numDecoderLogits = 1 + draftLength;
         auto const seqSlot = llmReq->mSeqSlot.value();
-        auto& decoderLogits = inputBuffers.logits.at(seqSlot);
         TensorPtr logitsView = ITensor::slice(logits, logitsIndex - numDecoderLogits, numDecoderLogits);
 
         if (modelConfig.getSpeculativeDecodingMode().hasDraftLogits())
@@ -136,22 +142,28 @@ SizeType32 HandleContextLogits::operator()(DecoderInputBuffers& inputBuffers, Re
 
         TLLM_CHECK_DEBUG_WITH_INFO(tru::tensorHasInvalid<float>(*logitsView, manager, "logits") == false,
             "Found invalid number (NaN or Inf) in logits");
-        // Scatter the output logits to the decoderLogits
-        auto const reqBeamWidth = llmReq->getBeamWidthByIter();
-        if (reqBeamWidth > 1)
-        {
-            // Tile logits of context requests
-            auto const logitsShape = logitsView->getShape();
-            auto const logitsType = logitsView->getDataType();
-            decoderLogits = manager.gpu(ITensor::makeShape({reqBeamWidth, logitsShape.d[1]}), logitsType);
-            tensorrt_llm::runtime::kernels::tileTensor(*decoderLogits, *logitsView, reqBeamWidth, manager.getStream());
-            decoderLogits->unsqueeze(0);
-        }
-        else
+
+        if (llmReq->isLastContextChunk())
         {
-            auto const logitsViewShape = logitsView->getShape();
-            decoderLogits
-                = ITensor::view(logitsView, ITensor::makeShape({logitsViewShape.d[0], 1, logitsViewShape.d[1]}));
+            TensorPtr decoderLogits;
+            auto const reqBeamWidth = llmReq->getBeamWidthByIter();
+            if (reqBeamWidth > 1)
+            {
+                // Tile logits of context requests
+                auto const& logitsShape = logitsView->getShape();
+                auto const logitsType = logitsView->getDataType();
+                decoderLogits = manager.gpu(ITensor::makeShape({reqBeamWidth, logitsShape.d[1]}), logitsType);
+                tensorrt_llm::runtime::kernels::tileTensor(
+                    *decoderLogits, *logitsView, reqBeamWidth, manager.getStream());
+                decoderLogits->unsqueeze(0);
+            }
+            else
+            {
+                decoderLogits = logitsView;
+                decoderLogits->unsqueeze(1);
+            }
+            decoderRequests.push_back(llmReq);
+            allDecoderLogits.emplace_back(std::move(decoderLogits));
        }
 
         ++batchIndex;
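
HandleContextLogits runs first in the step, so it now owns the reset of the per-step vectors (clear() plus reserve()) and appends a request only on its last context chunk; earlier chunks of a chunked request contribute no decoder logits. A stand-alone sketch of that population pattern, with hypothetical simplified types:

#include <cassert>
#include <memory>
#include <vector>

struct Request
{
    bool lastContextChunk;
    bool isLastContextChunk() const { return lastContextChunk; }
};
using RequestPtr = std::shared_ptr<Request>;

struct Buffers
{
    std::vector<RequestPtr> decoderRequests;
    std::vector<int> logits; // stand-in for std::vector<TensorPtr>
};

void handleContextLogits(Buffers& buffers, std::vector<RequestPtr> const& contextRequests)
{
    // Reset the dense vectors for this iteration before appending.
    buffers.decoderRequests.clear();
    buffers.decoderRequests.reserve(contextRequests.size());
    buffers.logits.clear();
    buffers.logits.reserve(contextRequests.size());

    int fakeLogits = 0;
    for (auto const& llmReq : contextRequests)
    {
        if (llmReq->isLastContextChunk()) // mid-chunk requests are skipped
        {
            buffers.decoderRequests.push_back(llmReq);
            buffers.logits.push_back(fakeLogits++);
        }
    }
}

int main()
{
    Buffers buffers;
    std::vector<RequestPtr> requests{
        std::make_shared<Request>(Request{false}), // still chunking: not decoded yet
        std::make_shared<Request>(Request{true}),  // last chunk: decoded this step
    };
    handleContextLogits(buffers, requests);
    assert(buffers.decoderRequests.size() == 1 && buffers.logits.size() == 1);
    return 0;
}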

cpp/tensorrt_llm/batch_manager/handleGenerationLogits.cpp

Lines changed: 13 additions & 4 deletions
@@ -22,6 +22,7 @@
 #include "tensorrt_llm/batch_manager/medusaBuffers.h"
 #include "tensorrt_llm/batch_manager/runtimeBuffers.h"
 #include "tensorrt_llm/batch_manager/utils/inflightBatchingUtils.h"
+#include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/nvtxUtils.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/utils/debugUtils.h"
@@ -82,6 +83,11 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(HandleGenerationLogits);
 
+    auto& decoderRequests = inputBuffers.decoderRequests;
+    decoderRequests.reserve(decoderRequests.size() + generationRequests.size());
+    auto& allDecoderLogits = inputBuffers.logits;
+    allDecoderLogits.reserve(allDecoderLogits.size() + generationRequests.size());
+
     for (auto const& llmReq : generationRequests)
     {
         auto const reqBeamWidth = llmReq->getBeamWidthByIter();
@@ -101,18 +107,21 @@ void HandleGenerationLogits::operator()(DecoderInputBuffers& inputBuffers, Reque
         TensorPtr logitsView = ITensor::slice(logits, logitsIndex, numLogits);
         TLLM_CHECK_DEBUG_WITH_INFO(tru::tensorHasInvalid<float>(*logitsView, manager, "logits") == false,
             "Found invalid number (NaN or Inf) in logits");
-        auto& decoderLogits = inputBuffers.logits.at(seqSlot);
-        auto const logitsViewShape = logitsView->getShape();
+
+        TLLM_CHECK(llmReq->isGenerationInProgressState());
+        TensorPtr decoderLogits;
         if (reqBeamWidth > 1)
         {
             decoderLogits = logitsView;
             decoderLogits->unsqueeze(0);
         }
         else
         {
-            decoderLogits
-                = ITensor::view(logitsView, ITensor::makeShape({logitsViewShape.d[0], 1, logitsViewShape.d[1]}));
+            decoderLogits = logitsView;
+            decoderLogits->unsqueeze(1);
         }
+        decoderRequests.push_back(llmReq);
+        allDecoderLogits.emplace_back(std::move(decoderLogits));
 
         if (llmReq->getReturnGenerationLogits())
         {
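
Both branches now produce a 3-D logits tensor via unsqueeze: with beamWidth > 1 the view is unsqueezed at dim 0, and with beamWidth == 1 it is unsqueezed at dim 1, which yields the same shape the old code built with ITensor::view. A shape-only model of the two cases, with a hypothetical unsqueeze helper rather than the real runtime::ITensor:

#include <cassert>
#include <cstddef>
#include <vector>

using Shape = std::vector<std::size_t>;

// Insert a dimension of extent 1 at position dim, like ITensor::unsqueeze.
Shape unsqueeze(Shape shape, std::size_t dim)
{
    shape.insert(shape.begin() + dim, 1);
    return shape;
}

int main()
{
    std::size_t const vocabSize = 32000;

    // beamWidth > 1: logitsView is [beamWidth, vocabSize];
    // unsqueeze(0) yields [1, beamWidth, vocabSize].
    Shape beamView{4, vocabSize};
    assert((unsqueeze(beamView, 0) == Shape{1, 4, vocabSize}));

    // beamWidth == 1: logitsView is [numLogits, vocabSize]; unsqueeze(1) yields
    // [numLogits, 1, vocabSize] -- the shape the old ITensor::view call produced.
    Shape singleView{2, vocabSize};
    assert((unsqueeze(singleView, 1) == Shape{2, 1, vocabSize}));
    return 0;
}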

cpp/tensorrt_llm/batch_manager/logitsPostProcessor.cpp

Lines changed: 22 additions & 30 deletions
@@ -17,25 +17,24 @@
 
 #include "tensorrt_llm/batch_manager/logitsPostProcessor.h"
 
+#include "tensorrt_llm/batch_manager/decoderBuffers.h"
 #include "tensorrt_llm/batch_manager/llmRequest.h"
 #include "tensorrt_llm/batch_manager/runtimeBuffers.h"
 #include "tensorrt_llm/common/nvtxUtils.h"
 #include "tensorrt_llm/runtime/iTensor.h"
-#include "tensorrt_llm/runtime/tllmRuntime.h"
 
 namespace tr = tensorrt_llm::runtime;
 
 namespace tensorrt_llm::batch_manager
 {
 
-using BufferManager = tensorrt_llm::runtime::BufferManager;
 using TensorPtr = runtime::ITensor::SharedPtr;
 using ITensor = runtime::ITensor;
 using SizeType32 = tensorrt_llm::runtime::SizeType32;
 
-bool LogitsPostProcessor::operator()(RequestVector const& contextRequests, RequestVector const& generationRequests,
-    bool replicateLogitsPostProcessor, std::vector<TensorPtr>& seqSlotLogits, tr::WorldConfig const& worldConfig,
-    tr::TllmRuntime& runtime, std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched) const
+bool LogitsPostProcessor::operator()(DecoderInputBuffers& inputBuffers, bool replicateLogitsPostProcessor,
+    tr::WorldConfig const& worldConfig, CudaStreamPtr const& stream,
+    std::optional<LogitsPostProcessorBatched> logitsPostProcessorBatched) const
 {
     TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
     NVTX3_SCOPED_RANGE(LogitsPostProcessor);
@@ -47,35 +46,28 @@ bool LogitsPostProcessor::operator()(RequestVector const& contextRequests, Reque
     std::vector<std::optional<LlmRequest::RequestIdType>> clientIdsVec;
 
     bool logitsPostProcessorIsApplied = false;
-    for (auto const& requests : {contextRequests, generationRequests})
+    for (size_t batchIdx = 0; batchIdx < inputBuffers.decoderRequests.size(); ++batchIdx)
     {
-        for (auto const& llmReq : requests)
+        auto const& llmReq = inputBuffers.decoderRequests.at(batchIdx);
+        auto& logits = inputBuffers.logits.at(batchIdx);
+
+        // Invoke non-batched processor or collect arguments for batched processor
+        if (llmReq->mLogitsPostProcessor)
         {
-            if (llmReq->isContextInitState() ? llmReq->isLastContextChunk() : llmReq->isGenerationInProgressState())
+            logitsPostProcessorIsApplied = true;
+            if (replicateLogitsPostProcessor || worldConfig.isFirstTensorParallelRank())
             {
-                // Invoke non-batched processor or collect arguments for batched processor
-                if (llmReq->mLogitsPostProcessor)
-                {
-                    logitsPostProcessorIsApplied = true;
-                    if (replicateLogitsPostProcessor || worldConfig.isFirstTensorParallelRank())
-                    {
-                        auto& logits = seqSlotLogits.at(llmReq->mSeqSlot.value());
-                        (*llmReq->mLogitsPostProcessor)(
-                            llmReq->mRequestId, logits, llmReq->getTokens(), runtime.getStreamPtr(), llmReq->mClientId);
-                    }
-                }
-                else if (llmReq->mApplyLogitsPostProcessorBatched)
-                {
-                    reqIdsVec.push_back(llmReq->mRequestId);
-
-                    auto& logits = seqSlotLogits.at(llmReq->mSeqSlot.value());
-                    logitsVec.push_back(logits);
-
-                    beamTokensVec.emplace_back(llmReq->getTokens());
-                    clientIdsVec.push_back(llmReq->mClientId);
-                }
+                (*llmReq->mLogitsPostProcessor)(
+                    llmReq->mRequestId, logits, llmReq->getTokens(), stream, llmReq->mClientId);
             }
         }
+        else if (llmReq->mApplyLogitsPostProcessorBatched)
+        {
+            reqIdsVec.push_back(llmReq->mRequestId);
+            logitsVec.push_back(logits);
+            beamTokensVec.emplace_back(llmReq->getTokens());
+            clientIdsVec.push_back(llmReq->mClientId);
+        }
     }
 
     // Invoke batched processor
@@ -84,7 +76,7 @@ bool LogitsPostProcessor::operator()(RequestVector const& contextRequests, Reque
         logitsPostProcessorIsApplied = true;
         if (replicateLogitsPostProcessor || worldConfig.isFirstTensorParallelRank())
         {
-            (*logitsPostProcessorBatched)(reqIdsVec, logitsVec, beamTokensVec, runtime.getStreamPtr(), clientIdsVec);
+            (*logitsPostProcessorBatched)(reqIdsVec, logitsVec, beamTokensVec, stream, clientIdsVec);
         }
     }
 
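
With decoderRequests pre-filtered by the logits handlers, the post processor's request-state checks and seqSlot lookups disappear; positions in decoderRequests and logits correspond one-to-one. A self-contained model of the new loop, with hypothetical simplified types rather than the real LlmRequest:

#include <cstdio>
#include <functional>
#include <memory>
#include <vector>

struct Request
{
    unsigned long requestId = 0;
    std::function<void(unsigned long, float*)> logitsPostProcessor; // per-request hook
};

int main()
{
    std::vector<std::shared_ptr<Request>> decoderRequests;
    std::vector<float*> logits; // parallel to decoderRequests

    auto req = std::make_shared<Request>();
    req->requestId = 1;
    req->logitsPostProcessor = [](unsigned long id, float*) { std::printf("req %lu\n", id); };
    decoderRequests.push_back(req);
    logits.push_back(nullptr);

    bool applied = false;
    for (size_t batchIdx = 0; batchIdx < decoderRequests.size(); ++batchIdx)
    {
        auto const& llmReq = decoderRequests.at(batchIdx);
        auto& reqLogits = logits.at(batchIdx); // dense index, no seqSlot lookup
        if (llmReq->logitsPostProcessor)
        {
            applied = true;
            llmReq->logitsPostProcessor(llmReq->requestId, reqLogits);
        }
    }
    std::printf("applied=%d\n", applied);
    return 0;
}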
