
Commit 810e37d

Merge branch 'main' into model_loader
2 parents 21ff657 + 40820e6

File tree: 81 files changed, +4586 -938 lines


3rdparty/xgrammar

Submodule xgrammar updated 114 files

cpp/kernels/xqa/gmma_impl.cuh

Lines changed: 464 additions & 2 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp

Lines changed: 5 additions & 19 deletions
@@ -109,34 +109,20 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
         }
         case executor::GuidedDecodingParams::GuideType::kREGEX:
         {
-            auto const& grammar = xgrammar::Grammar::FromRegex(guide.value());
-            mXGrammarMatchers.at(seqSlot)
-                = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                mXGrammarCompiler->CompileRegex(guide.value()));
             break;
         }
         case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
         {
-            auto const& grammar = xgrammar::Grammar::FromEBNF(guide.value());
-            mXGrammarMatchers.at(seqSlot)
-                = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                mXGrammarCompiler->CompileGrammar(guide.value()));
             break;
         }
         case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
         {
-            auto const& structuralTagParametersJson = nlohmann::json::parse(guide.value());
-            auto const& structuralTagItemsJson
-                = structuralTagParametersJson.at("structures").template get<std::vector<nlohmann::json>>();
-            std::vector<xgrammar::StructuralTagItem> structuralTagItems;
-            for (auto const& s : structuralTagItemsJson)
-            {
-                structuralTagItems.emplace_back(
-                    xgrammar::StructuralTagItem{s.at("begin").template get<std::string>(),
-                        s.at("schema").dump(), s.at("end").template get<std::string>()});
-            }
-            auto const& triggers
-                = structuralTagParametersJson.at("triggers").template get<std::vector<std::string>>();
             mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
-                mXGrammarCompiler->CompileStructuralTag(structuralTagItems, triggers));
+                mXGrammarCompiler->CompileStructuralTag(guide.value()));
             break;
         }
         default:
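The change pushes structural-tag JSON parsing into xgrammar itself: the updated compiler accepts the raw guide string for all three guide types, so the caller no longer parses "structures"/"triggers" or builds xgrammar::StructuralTagItem objects by hand. A minimal sketch of the new one-step call pattern, assuming the compiler member is an xgrammar::GrammarCompiler as used above; the helper name makeMatcher is hypothetical:

// Sketch only: dispatch a raw guide string to the matching one-step
// compile call, as guidedDecoder.cpp now does per sequence slot.
std::shared_ptr<xgrammar::GrammarMatcher> makeMatcher(
    xgrammar::GrammarCompiler& compiler,
    executor::GuidedDecodingParams::GuideType type, std::string const& guide)
{
    switch (type)
    {
    case executor::GuidedDecodingParams::GuideType::kREGEX:
        return std::make_shared<xgrammar::GrammarMatcher>(compiler.CompileRegex(guide));
    case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
        return std::make_shared<xgrammar::GrammarMatcher>(compiler.CompileGrammar(guide));
    case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
        // The structural-tag JSON is now parsed inside xgrammar.
        return std::make_shared<xgrammar::GrammarMatcher>(compiler.CompileStructuralTag(guide));
    default:
        return nullptr;
    }
}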

cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu

Lines changed: 32 additions & 22 deletions
@@ -38,17 +38,18 @@ static constexpr int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-template <typename T>
-__device__ T calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, T score, int32_t laneIdx, int32_t NumTopExperts)
+template <typename DataType>
+__device__ DataType calcSoftmax(
+    cg::thread_block_tile<WARP_SIZE> const& warp, DataType score, int32_t laneIdx, int32_t NumTopExperts)
 {
-    T maxScore = T{-INFINITY};
+    float maxScore = -INFINITY;
     if (laneIdx < NumTopExperts)
     {
-        maxScore = score >= maxScore ? score : maxScore;
+        maxScore = float(score) >= maxScore ? float(score) : maxScore;
     }
-    maxScore = cg::reduce(warp, maxScore, cg::greater<T>());
+    maxScore = cg::reduce(warp, maxScore, cg::greater<float>());

-    float sumScore{0.f};
+    float sumScore = 0.f;
     float newScore;
     // Get the summation of scores for each token
     if (laneIdx < NumTopExperts)
@@ -61,7 +62,7 @@ __device__ T calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, T score,

     if (laneIdx < NumTopExperts)
     {
-        score = static_cast<T>(newScore / sumScore);
+        score = static_cast<DataType>(newScore / sumScore);
     }

     return score;
@@ -70,31 +71,35 @@ __device__ T calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, T score,
 template <typename DataType, int VecSize>
 __device__ void calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, DataType (&scores)[VecSize])
 {
-    DataType maxScore = DataType{-INFINITY};
-    DataType sumScore = DataType{0.f};
-
+    // Compute in float to support half/bfloat16 inputs safely.
+    float maxScore = -INFINITY;
+    float sumScore = 0.f;
     // Get the max score for each token
 #pragma unroll
     for (int i = 0; i < VecSize; ++i)
     {
-        maxScore = scores[i] >= maxScore ? scores[i] : maxScore;
+        float si = static_cast<float>(scores[i]);
+        maxScore = si >= maxScore ? si : maxScore;
     }
-    maxScore = cg::reduce(warp, maxScore, cg::greater<DataType>());
+    maxScore = cg::reduce(warp, maxScore, cg::greater<float>());

     // Get the summation of scores for each token
 #pragma unroll
     for (int i = 0; i < VecSize; ++i)
     {
-        scores[i] = static_cast<DataType>(exp(scores[i] - maxScore));
-        sumScore += scores[i];
+        float si = static_cast<float>(scores[i]);
+        float e = expf(si - maxScore);
+        scores[i] = static_cast<DataType>(e);
+        sumScore += e;
     }
-    sumScore = cg::reduce(warp, sumScore, cg::plus<DataType>());
+    sumScore = cg::reduce(warp, sumScore, cg::plus<float>());

     // Normalize the scores
 #pragma unroll
     for (int i = 0; i < VecSize; ++i)
     {
-        scores[i] = static_cast<DataType>(scores[i] / sumScore);
+        float si = static_cast<float>(scores[i]) / sumScore;
+        scores[i] = static_cast<DataType>(si);
     }
 }

@@ -205,7 +210,7 @@ int nextPowerOfTwo(int num)
     break;

 template <typename InputT, typename OutputT, typename IdxT, bool DoSoftmaxBeforeTopK>
-void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
+void invokeCustomMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
     int64_t const numExperts, int64_t const topK, cudaStream_t const stream)
 {

@@ -249,20 +254,25 @@ void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* top
 }

 #define INSTANTIATE_RENORM_MOE_ROUTING(InputT, OutputT, IdxT, DoSoftmaxBeforeTopK) \
-    template void invokeRenormMoeRouting<InputT, OutputT, IdxT, DoSoftmaxBeforeTopK>(InputT * routerLogits, \
+    template void invokeCustomMoeRouting<InputT, OutputT, IdxT, DoSoftmaxBeforeTopK>(InputT * routerLogits, \
         OutputT * topkValues, IdxT * topkIndices, int64_t const numTokens, int64_t const numExperts, \
         int64_t const topK, cudaStream_t const stream);

 INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t, false);
 INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t, false);
-#ifdef ENABLE_BF16
-INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, false);
-#endif
-
 INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t, true);
 INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t, true);
+
 #ifdef ENABLE_BF16
+INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, false);
+INSTANTIATE_RENORM_MOE_ROUTING(float, __nv_bfloat16, int32_t, false);
+INSTANTIATE_RENORM_MOE_ROUTING(half, __nv_bfloat16, int32_t, false);
+INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, int32_t, false);
+
 INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, true);
+INSTANTIATE_RENORM_MOE_ROUTING(float, __nv_bfloat16, int32_t, true);
+INSTANTIATE_RENORM_MOE_ROUTING(half, __nv_bfloat16, int32_t, true);
+INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, int32_t, true);
 #endif

 } // namespace tensorrt_llm::kernels
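The thread running through this file is that all softmax arithmetic and both warp reductions are widened to float, so half and bfloat16 logits no longer max, exponentiate, and sum in their narrow storage type (half overflows above 65504 and loses precision when accumulating exponentials). A self-contained sketch of the same pattern; the kernel is illustrative, not one of the file's kernels, and assumes a single-warp launch:

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cuda_fp16.h>

namespace cg = cooperative_groups;

// Illustrative warp-wide softmax over 32 half scores, one per lane,
// launched as warpSoftmaxHalf<<<1, 32>>>(in, out). Mirrors the commit's
// pattern: cast to float, reduce in float, cast back only on store.
__global__ void warpSoftmaxHalf(__half const* in, __half* out)
{
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());
    float si = __half2float(in[warp.thread_rank()]);

    float maxScore = cg::reduce(warp, si, cg::greater<float>()); // shift for stability
    float e = expf(si - maxScore);
    float sumScore = cg::reduce(warp, e, cg::plus<float>());

    out[warp.thread_rank()] = __float2half(e / sumScore);
}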

cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h

Lines changed: 1 addition & 1 deletion
@@ -24,6 +24,6 @@
 namespace tensorrt_llm::kernels
 {
 template <typename InputT, typename OutputT, typename IdxT, bool DoSoftmaxBeforeTopK>
-void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
+void invokeCustomMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
     int64_t const numExperts, int64_t const topK, cudaStream_t const stream);
 } // namespace tensorrt_llm::kernels
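The rename leaves the signature intact, so call sites only swap the function name. A usage sketch using the (half, float, int32_t, true) instantiation the .cu file provides; the buffer names and extents are illustrative:

// dLogits: numTokens x numExperts router logits (device, half).
// dTopkValues / dTopkIndices: numTokens x topK outputs (device).
tensorrt_llm::kernels::invokeCustomMoeRouting<half, float, int32_t,
    /*DoSoftmaxBeforeTopK=*/true>(
    dLogits, dTopkValues, dTopkIndices, numTokens, numExperts, topK, stream);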

cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu

Lines changed: 4 additions & 4 deletions
@@ -186,14 +186,14 @@ void run(Data const& data, void* stream)
     if (data.mUseDeepSeekFp8)
     {
         int const numThreads = 128;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationDeepSeekKernel, grid, numThreads, 0, stream);
     }
     else
     {
         int const numThreads = 256;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationKernel, grid, numThreads, 0, stream);
     }
@@ -371,7 +371,7 @@ void run(Data const& data, void* stream)
     constexpr int VecSize = 4;
     int const numThreads = 128;
     int const numBlocksX = (data.hiddenDimSf / VecSize - 1 + numThreads) / numThreads;
-    int const numBlocksY = data.numTokens;
+    int const numBlocksY = std::min(8192, data.numTokens);
     dim3 numBlocks(numBlocksX, numBlocksY);
 #define CONVERT_FP4_SF_LAUNCH(LayoutSrc, LayoutDst) \
     if (data.sfLayoutSrc == tg::SfLayout::LayoutSrc && data.sfLayoutDst == tg::SfLayout::LayoutDst) \
@@ -457,7 +457,7 @@ void run(Data const& data, void* stream)
     {
         int const numThreads = 256;
         int const numBlocksX = (data.hiddenDim - 1 + numThreads) / numThreads;
-        int const numBlocksY = data.numTokens;
+        int const numBlocksY = std::min(8192, data.numTokens);
         dim3 numBlocks(numBlocksX, numBlocksY);

         LAUNCH(data, permuteKernel, numBlocks, numThreads, 0, stream);
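Each of these launches now clamps the token-indexed grid dimension to 8192 blocks, which stays well under the 65535 hardware limit on gridDim.y/gridDim.z for very large batches and bounds the grid size. This presumes the kernels walk any remaining tokens with a grid-stride loop; the kernel bodies are not part of this hunk. A minimal sketch of that loop shape with a hypothetical per-token kernel:

// Hypothetical kernel showing the grid-stride pattern a capped launch
// relies on: block y covers tokens blockIdx.y, blockIdx.y + gridDim.y, ...
// so any numTokens is handled with gridDim.y <= 8192.
__global__ void perTokenKernel(float* tokens, int hiddenDim, int numTokens)
{
    int const col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= hiddenDim)
        return;
    for (int token = blockIdx.y; token < numTokens; token += gridDim.y)
    {
        // Placeholder per-element work on token row `token`.
        tokens[static_cast<int64_t>(token) * hiddenDim + col] *= 2.f;
    }
}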
