
Commit 810e37d

Merge branch 'main' into model_loader
2 parents 21ff657 + 40820e6

File tree: 81 files changed, +4586 -938 lines


3rdparty/xgrammar

Submodule xgrammar updated 114 files

cpp/kernels/xqa/gmma_impl.cuh

Lines changed: 464 additions & 2 deletions
Large diffs are not rendered by default.

cpp/tensorrt_llm/batch_manager/guidedDecoder.cpp

Lines changed: 5 additions & 19 deletions
@@ -109,34 +109,20 @@ void GuidedDecoder::build(ScheduledRequests const& scheduledRequests)
         }
         case executor::GuidedDecodingParams::GuideType::kREGEX:
         {
-            auto const& grammar = xgrammar::Grammar::FromRegex(guide.value());
-            mXGrammarMatchers.at(seqSlot)
-                = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                mXGrammarCompiler->CompileRegex(guide.value()));
             break;
         }
         case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
         {
-            auto const& grammar = xgrammar::Grammar::FromEBNF(guide.value());
-            mXGrammarMatchers.at(seqSlot)
-                = std::make_shared<xgrammar::GrammarMatcher>(mXGrammarCompiler->CompileGrammar(grammar));
+            mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
+                mXGrammarCompiler->CompileGrammar(guide.value()));
             break;
         }
         case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
         {
-            auto const& structuralTagParametersJson = nlohmann::json::parse(guide.value());
-            auto const& structuralTagItemsJson
-                = structuralTagParametersJson.at("structures").template get<std::vector<nlohmann::json>>();
-            std::vector<xgrammar::StructuralTagItem> structuralTagItems;
-            for (auto const& s : structuralTagItemsJson)
-            {
-                structuralTagItems.emplace_back(
-                    xgrammar::StructuralTagItem{s.at("begin").template get<std::string>(),
-                        s.at("schema").dump(), s.at("end").template get<std::string>()});
-            }
-            auto const& triggers
-                = structuralTagParametersJson.at("triggers").template get<std::vector<std::string>>();
             mXGrammarMatchers.at(seqSlot) = std::make_shared<xgrammar::GrammarMatcher>(
-                mXGrammarCompiler->CompileStructuralTag(structuralTagItems, triggers));
+                mXGrammarCompiler->CompileStructuralTag(guide.value()));
             break;
         }
         default:
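The change pushes structural-tag JSON parsing into xgrammar itself: the updated compiler accepts the raw guide string for all three guide types, so the caller no longer parses "structures"/"triggers" or builds xgrammar::StructuralTagItem objects by hand. A minimal sketch of the new one-step call pattern, assuming the compiler member is an xgrammar::GrammarCompiler as used above; the helper name makeMatcher is hypothetical:

// Sketch only: dispatch a raw guide string to the matching one-step
// compile call, as guidedDecoder.cpp now does per sequence slot.
std::shared_ptr<xgrammar::GrammarMatcher> makeMatcher(
    xgrammar::GrammarCompiler& compiler,
    executor::GuidedDecodingParams::GuideType type, std::string const& guide)
{
    switch (type)
    {
    case executor::GuidedDecodingParams::GuideType::kREGEX:
        return std::make_shared<xgrammar::GrammarMatcher>(compiler.CompileRegex(guide));
    case executor::GuidedDecodingParams::GuideType::kEBNF_GRAMMAR:
        return std::make_shared<xgrammar::GrammarMatcher>(compiler.CompileGrammar(guide));
    case executor::GuidedDecodingParams::GuideType::kSTRUCTURAL_TAG:
        // The structural-tag JSON is now parsed inside xgrammar.
        return std::make_shared<xgrammar::GrammarMatcher>(compiler.CompileStructuralTag(guide));
    default:
        return nullptr;
    }
}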

cpp/tensorrt_llm/kernels/customMoeRoutingKernels.cu

Lines changed: 32 additions & 22 deletions
@@ -38,17 +38,18 @@ static constexpr int WARPS_PER_BLOCK = BLOCK_SIZE / WARP_SIZE;

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-template <typename T>
-__device__ T calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, T score, int32_t laneIdx, int32_t NumTopExperts)
+template <typename DataType>
+__device__ DataType calcSoftmax(
+    cg::thread_block_tile<WARP_SIZE> const& warp, DataType score, int32_t laneIdx, int32_t NumTopExperts)
 {
-    T maxScore = T{-INFINITY};
+    float maxScore = -INFINITY;
     if (laneIdx < NumTopExperts)
     {
-        maxScore = score >= maxScore ? score : maxScore;
+        maxScore = float(score) >= maxScore ? float(score) : maxScore;
     }
-    maxScore = cg::reduce(warp, maxScore, cg::greater<T>());
+    maxScore = cg::reduce(warp, maxScore, cg::greater<float>());

-    float sumScore{0.f};
+    float sumScore = 0.f;
     float newScore;
     // Get the summation of scores for each token
     if (laneIdx < NumTopExperts)
@@ -61,7 +62,7 @@ __device__ T calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, T score,

     if (laneIdx < NumTopExperts)
     {
-        score = static_cast<T>(newScore / sumScore);
+        score = static_cast<DataType>(newScore / sumScore);
     }

     return score;
@@ -70,31 +71,35 @@ __device__ T calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, T score,
 template <typename DataType, int VecSize>
 __device__ void calcSoftmax(cg::thread_block_tile<WARP_SIZE> const& warp, DataType (&scores)[VecSize])
 {
-    DataType maxScore = DataType{-INFINITY};
-    DataType sumScore = DataType{0.f};
-
+    // Compute in float to support half/bfloat16 inputs safely.
+    float maxScore = -INFINITY;
+    float sumScore = 0.f;
     // Get the max score for each token
 #pragma unroll
     for (int i = 0; i < VecSize; ++i)
     {
-        maxScore = scores[i] >= maxScore ? scores[i] : maxScore;
+        float si = static_cast<float>(scores[i]);
+        maxScore = si >= maxScore ? si : maxScore;
     }
-    maxScore = cg::reduce(warp, maxScore, cg::greater<DataType>());
+    maxScore = cg::reduce(warp, maxScore, cg::greater<float>());

     // Get the summation of scores for each token
 #pragma unroll
     for (int i = 0; i < VecSize; ++i)
     {
-        scores[i] = static_cast<DataType>(exp(scores[i] - maxScore));
-        sumScore += scores[i];
+        float si = static_cast<float>(scores[i]);
+        float e = expf(si - maxScore);
+        scores[i] = static_cast<DataType>(e);
+        sumScore += e;
     }
-    sumScore = cg::reduce(warp, sumScore, cg::plus<DataType>());
+    sumScore = cg::reduce(warp, sumScore, cg::plus<float>());

     // Normalize the scores
 #pragma unroll
     for (int i = 0; i < VecSize; ++i)
     {
-        scores[i] = static_cast<DataType>(scores[i] / sumScore);
+        float si = static_cast<float>(scores[i]) / sumScore;
+        scores[i] = static_cast<DataType>(si);
     }
 }

@@ -205,7 +210,7 @@ int nextPowerOfTwo(int num)
     break;

 template <typename InputT, typename OutputT, typename IdxT, bool DoSoftmaxBeforeTopK>
-void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
+void invokeCustomMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
     int64_t const numExperts, int64_t const topK, cudaStream_t const stream)
 {

@@ -249,20 +254,25 @@ void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* top
 }

 #define INSTANTIATE_RENORM_MOE_ROUTING(InputT, OutputT, IdxT, DoSoftmaxBeforeTopK) \
-    template void invokeRenormMoeRouting<InputT, OutputT, IdxT, DoSoftmaxBeforeTopK>(InputT * routerLogits, \
+    template void invokeCustomMoeRouting<InputT, OutputT, IdxT, DoSoftmaxBeforeTopK>(InputT * routerLogits, \
         OutputT * topkValues, IdxT * topkIndices, int64_t const numTokens, int64_t const numExperts, \
         int64_t const topK, cudaStream_t const stream);

 INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t, false);
 INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t, false);
-#ifdef ENABLE_BF16
-INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, false);
-#endif
-
 INSTANTIATE_RENORM_MOE_ROUTING(float, float, int32_t, true);
 INSTANTIATE_RENORM_MOE_ROUTING(half, float, int32_t, true);
+
 #ifdef ENABLE_BF16
+INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, false);
+INSTANTIATE_RENORM_MOE_ROUTING(float, __nv_bfloat16, int32_t, false);
+INSTANTIATE_RENORM_MOE_ROUTING(half, __nv_bfloat16, int32_t, false);
+INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, int32_t, false);
+
 INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, float, int32_t, true);
+INSTANTIATE_RENORM_MOE_ROUTING(float, __nv_bfloat16, int32_t, true);
+INSTANTIATE_RENORM_MOE_ROUTING(half, __nv_bfloat16, int32_t, true);
+INSTANTIATE_RENORM_MOE_ROUTING(__nv_bfloat16, __nv_bfloat16, int32_t, true);
 #endif

 } // namespace tensorrt_llm::kernels
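The thread running through this file is that all softmax arithmetic and both warp reductions are widened to float, so half and bfloat16 logits no longer max, exponentiate, and sum in their narrow storage type (half overflows above 65504 and loses precision when accumulating exponentials). A self-contained sketch of the same pattern; the kernel is illustrative, not one of the file's kernels, and assumes a single-warp launch:

#include <cooperative_groups.h>
#include <cooperative_groups/reduce.h>
#include <cuda_fp16.h>

namespace cg = cooperative_groups;

// Illustrative warp-wide softmax over 32 half scores, one per lane,
// launched as warpSoftmaxHalf<<<1, 32>>>(in, out). Mirrors the commit's
// pattern: cast to float, reduce in float, cast back only on store.
__global__ void warpSoftmaxHalf(__half const* in, __half* out)
{
    auto warp = cg::tiled_partition<32>(cg::this_thread_block());
    float si = __half2float(in[warp.thread_rank()]);

    float maxScore = cg::reduce(warp, si, cg::greater<float>()); // shift for stability
    float e = expf(si - maxScore);
    float sumScore = cg::reduce(warp, e, cg::plus<float>());

    out[warp.thread_rank()] = __float2half(e / sumScore);
}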

cpp/tensorrt_llm/kernels/customMoeRoutingKernels.h

Lines changed: 1 addition & 1 deletion
@@ -24,6 +24,6 @@
 namespace tensorrt_llm::kernels
 {
 template <typename InputT, typename OutputT, typename IdxT, bool DoSoftmaxBeforeTopK>
-void invokeRenormMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
+void invokeCustomMoeRouting(InputT* routerLogits, OutputT* topkValues, IdxT* topkIndices, int64_t const numTokens,
     int64_t const numExperts, int64_t const topK, cudaStream_t const stream);
 } // namespace tensorrt_llm::kernels
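The rename leaves the signature intact, so call sites only swap the function name. A usage sketch using the (half, float, int32_t, true) instantiation the .cu file provides; the buffer names and extents are illustrative:

// dLogits: numTokens x numExperts router logits (device, half).
// dTopkValues / dTopkIndices: numTokens x topK outputs (device).
tensorrt_llm::kernels::invokeCustomMoeRouting<half, float, int32_t,
    /*DoSoftmaxBeforeTopK=*/true>(
    dLogits, dTopkValues, dTopkIndices, numTokens, numExperts, topK, stream);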

cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/DevKernel.cu

Lines changed: 4 additions & 4 deletions
@@ -186,14 +186,14 @@ void run(Data const& data, void* stream)
     if (data.mUseDeepSeekFp8)
     {
         int const numThreads = 128;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationDeepSeekKernel, grid, numThreads, 0, stream);
     }
     else
     {
         int const numThreads = 256;
-        const dim3 grid(data.innerDim / 128, data.topK, data.numTokens);
+        const dim3 grid(data.innerDim / 128, data.topK, std::min(8192, data.numTokens));

         LAUNCH(data, activationKernel, grid, numThreads, 0, stream);
     }
@@ -371,7 +371,7 @@ void run(Data const& data, void* stream)
     constexpr int VecSize = 4;
     int const numThreads = 128;
     int const numBlocksX = (data.hiddenDimSf / VecSize - 1 + numThreads) / numThreads;
-    int const numBlocksY = data.numTokens;
+    int const numBlocksY = std::min(8192, data.numTokens);
     dim3 numBlocks(numBlocksX, numBlocksY);
 #define CONVERT_FP4_SF_LAUNCH(LayoutSrc, LayoutDst) \
     if (data.sfLayoutSrc == tg::SfLayout::LayoutSrc && data.sfLayoutDst == tg::SfLayout::LayoutDst) \
@@ -457,7 +457,7 @@ void run(Data const& data, void* stream)
     {
         int const numThreads = 256;
         int const numBlocksX = (data.hiddenDim - 1 + numThreads) / numThreads;
-        int const numBlocksY = data.numTokens;
+        int const numBlocksY = std::min(8192, data.numTokens);
         dim3 numBlocks(numBlocksX, numBlocksY);

         LAUNCH(data, permuteKernel, numBlocks, numThreads, 0, stream);
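Each of these launches now clamps the token-indexed grid dimension to 8192 blocks, which stays well under the 65535 hardware limit on gridDim.y/gridDim.z for very large batches and bounds the grid size. This presumes the kernels walk any remaining tokens with a grid-stride loop; the kernel bodies are not part of this hunk. A minimal sketch of that loop shape with a hypothetical per-token kernel:

// Hypothetical kernel showing the grid-stride pattern a capped launch
// relies on: block y covers tokens blockIdx.y, blockIdx.y + gridDim.y, ...
// so any numTokens is handled with gridDim.y <= 8192.
__global__ void perTokenKernel(float* tokens, int hiddenDim, int numTokens)
{
    int const col = blockIdx.x * blockDim.x + threadIdx.x;
    if (col >= hiddenDim)
        return;
    for (int token = blockIdx.y; token < numTokens; token += gridDim.y)
    {
        // Placeholder per-element work on token row `token`.
        tokens[static_cast<int64_t>(token) * hiddenDim + col] *= 2.f;
    }
}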
