 #include "tensorrt_llm/runtime/common.h"
 #include "tensorrt_llm/runtime/iTensor.h"
 #include "tensorrt_llm/runtime/modelConfig.h"
-#include "tensorrt_llm/runtime/request.h"
 #include "tensorrt_llm/runtime/worldConfig.h"
 
 namespace tensorrt_llm::runtime
@@ -88,37 +87,6 @@ class CreateNewDecoderRequests : Algorithm
         SizeType32 maxSequenceLength, OptionalRef<MedusaBuffers const> medusaBuffers) const;
 
 private:
-    //! @brief Setups decoder internal tensors for new speculative decoding request
-    static void newRequestSpeculativeDecoding(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, runtime::ModelConfig const& modelConfig,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream,
-        CudaStream const& decoderStream, SpeculativeDecodingMode const& speculativeDecodingMode,
-        SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new request in Draft model Sps mode
-    static void newRequestDraftTokensExternal(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        SamplingConfig const& samplingConfig, DecodingInput& jointDecodingInput, CudaStream const& decoderStream);
-
-    //! @brief Setups decoder internal tensors for new Medusa request
-    static void newRequestMedusa(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, CudaStream const& decoderStream, SizeType32 maxDecodingEngineTokens);
-
-    //! @brief Setups decoder internal tensors for new Lookahead request
-    static void newRequestLookahead(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingInput& jointDecodingInput, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Explicit draft tokens request
-    static void newRequestExplicitDraftTokens(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    //! @brief Setups decoder internal tensors for new Eagle request
-    static void newRequestEagle(SizeType32 batchIdx, runtime::decoder_batch::Request const& request,
-        runtime::ModelConfig const& modelConfig, DecodingOutput& jointDecodingOutput, CudaStream const& runtimeStream);
-
-    [[nodiscard]] std::shared_ptr<runtime::ITensor> retrieveDraftLogits(runtime::ModelConfig const& modelConfig,
-        runtime::WorldConfig const& worldConfig, std::shared_ptr<runtime::ITensor> const& tensor,
-        runtime::BufferManager const& bufferManager) const;
-
     bool mSpeculativeDecodingFastLogits;
     bool mIsLeaderInOrchMode;
     bool mIsNormalizeLogProbs;
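The declarations removed above are the per-mode setup helpers for speculative decoding: newRequestDraftTokensExternal, newRequestMedusa, newRequestLookahead, newRequestExplicitDraftTokens, and newRequestEagle each handled one mode, with newRequestSpeculativeDecoding taking the SpeculativeDecodingMode and presumably dispatching among them. As a rough, self-contained illustration of that dispatch shape only, here is a minimal C++ sketch; the names below (SpecMode, setupMedusa, and so on) are hypothetical stand-ins, not TensorRT-LLM APIs, and the sketch does not reproduce the removed implementation.

#include <cstdint>
#include <iostream>

// Hypothetical, simplified stand-in for illustration; not a TensorRT-LLM type.
enum class SpecMode
{
    DraftTokensExternal,
    Medusa,
    Lookahead,
    ExplicitDraftTokens,
    Eagle
};

// One setup helper per speculative-decoding mode, echoing how the removed
// declarations each needed a different subset of decoder state.
void setupDraftTokensExternal(std::int32_t batchIdx)
{
    std::cout << "draft-tokens-external setup for batch slot " << batchIdx << '\n';
}

void setupMedusa(std::int32_t batchIdx, std::int32_t maxDecodingEngineTokens)
{
    std::cout << "medusa setup for batch slot " << batchIdx << ", up to " << maxDecodingEngineTokens
              << " engine tokens\n";
}

void setupLookahead(std::int32_t batchIdx)
{
    std::cout << "lookahead setup for batch slot " << batchIdx << '\n';
}

void setupExplicitDraftTokens(std::int32_t batchIdx)
{
    std::cout << "explicit-draft-tokens setup for batch slot " << batchIdx << '\n';
}

void setupEagle(std::int32_t batchIdx)
{
    std::cout << "eagle setup for batch slot " << batchIdx << '\n';
}

// Entry point that dispatches to the mode-specific helper, analogous in spirit
// (but not in detail) to the removed newRequestSpeculativeDecoding declaration.
void setupSpeculativeDecoding(std::int32_t batchIdx, SpecMode mode, std::int32_t maxDecodingEngineTokens)
{
    switch (mode)
    {
    case SpecMode::DraftTokensExternal: setupDraftTokensExternal(batchIdx); break;
    case SpecMode::Medusa: setupMedusa(batchIdx, maxDecodingEngineTokens); break;
    case SpecMode::Lookahead: setupLookahead(batchIdx); break;
    case SpecMode::ExplicitDraftTokens: setupExplicitDraftTokens(batchIdx); break;
    case SpecMode::Eagle: setupEagle(batchIdx); break;
    }
}

int main()
{
    setupSpeculativeDecoding(/*batchIdx=*/0, SpecMode::Medusa, /*maxDecodingEngineTokens=*/4);
    return 0;
}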