
Commit 37543a9

[None][refactor] Simplify decoder state initialization for speculative decoding (#6869)
Signed-off-by: Robin Kobus <[email protected]>
1 parent c232ba8 commit 37543a9

8 files changed, +223 -305 lines changed

cpp/include/tensorrt_llm/batch_manager/createNewDecoderRequests.h

Lines changed: 0 additions & 32 deletions
@@ -24,7 +24,6 @@
 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/request.h"
 #include "tensorrt_llm/runtime/worldConfig.h"

 namespace tensorrt_llm::runtime

@@ -88,37 +87,6 @@ class CreateNewDecoderRequests : Algorithm
         SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;

 private:
-    //! @brief Setups decoder internal tensors for new speculative decoding request
-    static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
-        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
-        SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
-    static void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream);
-
-    //! @brief Setups decoder internal tensors for new Medusa request
-    static void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new Lookahead request
-    static void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Explicit draft tokens request
-    static void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Eagle request
-    static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    [[nodiscard]] std::shared_ptr<runtime::ITensor> retrieveDraftLogits(runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig, std::shared_ptr<runtime::ITensor> const& tensor,
-        runtime::BufferManager const& bufferManager) const;
-
     bool mSpeculativeDecodingFastLogits;
     bool mIsLeaderInOrchMode;
     bool mIsNormalizeLogProbs;
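
The block removed above declared one dispatcher (newRequestSpeculativeDecoding) plus a private setup helper per speculative-decoding mode (external draft tokens, Medusa, Lookahead, explicit draft tokens, Eagle). The self-contained sketch below only illustrates that dispatch-per-mode pattern; every type, signature, and body in it is a simplified stand-in, not the TensorRT-LLM implementation that this commit consolidates elsewhere.

#include <cstdint>
#include <iostream>

// Which speculative-decoding variant a new request uses; mirrors the modes
// named in the removed comments above (the enum and its values are stand-ins).
enum class SpecDecMode { DraftTokensExternal, Medusa, Lookahead, ExplicitDraftTokens, Eagle };

// Toy stand-in for the decoder's per-request state.
struct DecoderState
{
    std::int32_t maxDecodingEngineTokens = 1;
    bool useDraftLogits = false;
};

namespace
{
// Mode-specific setup helpers, analogous in spirit to the removed private
// declarations (newRequestDraftTokensExternal, newRequestMedusa, ...).
void setupDraftTokensExternal(DecoderState& s) { s.useDraftLogits = true; }
void setupMedusa(DecoderState& s, std::int32_t maxTokens) { s.maxDecodingEngineTokens = maxTokens; }
} // namespace

// Single entry point that branches per mode, so callers only ever see one
// initialization routine.
void newRequestSpeculativeDecoding(DecoderState& state, SpecDecMode mode, std::int32_t maxTokens)
{
    switch (mode)
    {
    case SpecDecMode::DraftTokensExternal: setupDraftTokensExternal(state); break;
    case SpecDecMode::Medusa: setupMedusa(state, maxTokens); break;
    default: break; // remaining modes handled analogously
    }
}

int main()
{
    DecoderState state;
    newRequestSpeculativeDecoding(state, SpecDecMode::Medusa, 64);
    std::cout << state.maxDecodingEngineTokens << '\n'; // prints 64
}

Since the removed declarations were private, the observable change in this file is simply that the header no longer carries the per-mode setup surface.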

cpp/include/tensorrt_llm/batch_manager/llmRequest.h

Lines changed: 1 addition & 1 deletion
@@ -1110,7 +1110,7 @@ class GenericLlmRequest

     [[nodiscard]] SizeType32 getNumDraftTokens() const
     {
-        return mDraftTokens->size();
+        return hasDraftTokens() ? mDraftTokens->size() : 0;
     }

     void discardDraftTokens(SizeType32 numTokensToDiscard)
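
The new return statement guards the draft-token count behind hasDraftTokens(). Below is a minimal, self-contained sketch of that pattern, assuming mDraftTokens is a shared_ptr to a token vector and hasDraftTokens() reports whether it is set and non-empty; the surrounding class is a stand-in, and only the two member functions mirror the diff.

#include <cstdint>
#include <memory>
#include <vector>

using SizeType32 = std::int32_t;
using TokenIdType = std::int32_t;
using VecTokens = std::vector<TokenIdType>;

class RequestSketch
{
public:
    // Assumed semantics: draft tokens are present only if the pointer is set
    // and the vector is non-empty.
    [[nodiscard]] bool hasDraftTokens() const
    {
        return mDraftTokens && !mDraftTokens->empty();
    }

    // Before: mDraftTokens->size() dereferenced the pointer unconditionally.
    // After: requests without draft tokens simply report a count of 0.
    [[nodiscard]] SizeType32 getNumDraftTokens() const
    {
        return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
    }

private:
    std::shared_ptr<VecTokens> mDraftTokens; // may be unset for non-speculative requests
};

With the guard, the count no longer depends on how mDraftTokens happens to be initialized for non-speculative requests.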

cpp/include/tensorrt_llm/runtime/decodingInput.h

Lines changed: 2 additions & 0 deletions
@@ -102,11 +102,13 @@ class DecodingInput
     {
     public:
         TensorPtr draftLogits;
+        TensorPtr draftLogitsHost;
         TensorPtr draftProbs;
         TensorPtr targetProbs;
         TensorPtr numDraftTokens;
         TensorPtr numDraftTokensHost;
         TensorPtr draftTokenIds;
+        TensorPtr draftTokenIdsHost;
         TensorPtr useDraftLogits;
         TensorPtr useDraftLogitsHost;

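The two added members pair the existing device-side draft tensors with host-side counterparts, following the same naming as numDraftTokensHost and useDraftLogitsHost. The sketch below shows the general host-mirror pattern this suggests: fill or inspect draft data on the CPU, then stage it to the GPU asynchronously. The struct, raw pointers, and the use of cudaMemcpyAsync are illustrative assumptions, not the TensorRT-LLM ITensor/BufferManager code; only the four member names are taken from the diff.

#include <cstdint>
#include <vector>
#include <cuda_runtime_api.h>

// Each device-side draft buffer has a matching host-side mirror.
struct DraftBuffers
{
    float* draftLogits = nullptr;                 // device
    std::vector<float> draftLogitsHost;           // host mirror
    std::int32_t* draftTokenIds = nullptr;        // device
    std::vector<std::int32_t> draftTokenIdsHost;  // host mirror
};

// Stage the host mirrors to the device on the given stream.
inline void stageDraftInputs(DraftBuffers& b, cudaStream_t stream)
{
    cudaMemcpyAsync(b.draftTokenIds, b.draftTokenIdsHost.data(),
        b.draftTokenIdsHost.size() * sizeof(std::int32_t), cudaMemcpyHostToDevice, stream);
    cudaMemcpyAsync(b.draftLogits, b.draftLogitsHost.data(),
        b.draftLogitsHost.size() * sizeof(float), cudaMemcpyHostToDevice, stream);
}

In the actual runtime the copies presumably go through the BufferManager on the appropriate stream; the point here is only the one-to-one pairing of host and device buffers.
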
cpp/include/tensorrt_llm/runtime/request.h

Lines changed: 0 additions & 54 deletions
This file was deleted.

0 commit comments
