QiJune
diff --git a/‎cpp/include/tensorrt_llm/common/cudaUtils.h‎
Lines changed: 6 additions & 0 deletions b/‎cpp/include/tensorrt_llm/common/cudaUtils.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp‎
Lines changed: 252 additions & 58 deletions b/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.cpp‎
Lines changed: 252 additions & 58 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h‎
Lines changed: 20 additions & 16 deletions b/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/KernelRunner.h‎
Lines changed: 20 additions & 16 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/.clang-format‎
Lines changed: 78 additions & 0 deletions b/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/.clang-format‎
Lines changed: 78 additions & 0 deletions
diff --git a/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h‎
Lines changed: 11 additions & 2 deletions b/‎cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmEnums.h‎
Lines changed: 11 additions & 2 deletions
@@ -19,6 +19,9 @@
 #include "tensorrt_llm/common/cudaBf16Wrapper.h"
 #include "tensorrt_llm/common/cudaDriverWrapper.h"
 #include "tensorrt_llm/common/cudaFp8Utils.h"
+#ifdef ENABLE_FP4 // same guard form as the __nv_fp4_e2m1 printArrayInfo instantiation, so both are enabled together
+#include <cuda_fp4.h>
+#endif
 #include "tensorrt_llm/common/logger.h"
 #include "tensorrt_llm/common/tllmException.h"
 #include <algorithm>
@@ -545,6 +548,9 @@ template void printArrayInfo(__nv_bfloat16 const* ptr, uint64_t nElement, std::s
 #ifdef ENABLE_FP8
 template void printArrayInfo(__nv_fp8_e4m3 const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 #endif
+#ifdef ENABLE_FP4
+template void printArrayInfo(__nv_fp4_e2m1 const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
+#endif
 template void printArrayInfo(uint32_t const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 template void printArrayInfo(uint64_t const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 template void printArrayInfo(int const* ptr, uint64_t nElement, std::string name, bool const bPrintElement);
 
@@ -68,50 +68,54 @@ class TrtllmGenBatchedGemmRunner
         int32_t configIndex) const;
 
     // Generic GEMM interface
-    void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, int32_t numTokens,
-        int32_t numBatches, int32_t maxNumCtasInBatchDim, void const* a, void const* sfA, void const* b,
-        void const* sfB, void const* perTokensSfA, void const* perTokensSfB, float const* scaleC,
-        float const* scaleGateC, float const* bias, float const* swiGluAlpha, float const* swiGluBeta,
-        float const* clampLimit, void* c, void* outSfC, int32_t const* routeMap, int32_t const* totalNumPaddedTokens,
-        int32_t const* ctaIdxXyToBatchIdx, int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas,
-        void* workspace, CUstream stream, int device, int32_t configIndex);
+    // validM/validN/validK are passed in addition to m/n/k and are required (no default) in this overload.
+    // NOTE(review): presumably the valid (unpadded) extents inside the m/n/k problem shape; the convenience
+    // overloads of run() default them to -1 -- confirm what -1 means in KernelRunner.cpp.
+    void run(int32_t m, int32_t n, int32_t k, int32_t validM, int32_t validN, int32_t validK,
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
+        void const* a, void const* sfA, void const* b, void const* sfB, void const* perTokensSfA,
+        void const* perTokensSfB, float const* scaleC, float const* scaleGateC, float const* bias,
+        float const* swiGluAlpha, float const* swiGluBeta, float const* clampLimit, void* c, void* outSfC,
+        int32_t const* routeMap, int32_t const* totalNumPaddedTokens, int32_t const* ctaIdxXyToBatchIdx,
+        int32_t const* ctaIdxXyToMnLimit, int32_t const* numNonExitingCtas, void* workspace, CUstream stream,
+        int device, int32_t configIndex);
 
     // Block-scaling GEMM
     void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* sfA,
         void const* b, void const* sfB, void* c, void* outSfC, void* workspace, CUstream stream, int device,
-        int32_t configIndex);
+        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1);
 
     // Block-scaling GEMM with SwiGLU activation
     void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* sfA,
         void const* b, void const* sfB, float const* bias, float const* swiGluAlpha, float const* swiGluBeta,
         float const* clampLimit, void* c, void* outSfC, void* workspace, CUstream stream, int device,
-        int32_t configIndex);
+        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1);
 
     // FP8 per-tensor scaling GEMM
     void run(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, void const* a, void const* b,
         float const* scaleC, float const* scaleGateC, void* c, void* workspace, CUstream stream, int device,
-        int32_t configIndex);
+        int32_t configIndex, int32_t validM = -1, int32_t validN = -1, int32_t validK = -1);
 
     // Get the list of configs that passed the validation based on the constructor options
     [[nodiscard]] std::vector<int64_t> getPassingConfigIndices() const
     {
         return mPassingConfigIndices;
     }
 
+    // Get the kernel name from the config index
+    // NOTE(review): configIndex is presumably one of the values returned by getPassingConfigIndices() -- confirm.
+    [[nodiscard]] std::string getKernelNameFromConfigIndex(int32_t configIndex) const;
+
     // Get the list of config indices that are valid for the given problem shape
     [[nodiscard]] std::vector<int64_t> getValidConfigIndices(int32_t m, int32_t n, int32_t k,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches,
-        int32_t maxNumCtasInBatchDim) const;
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
+        int32_t validM = -1, int32_t validN = -1, int32_t validK = -1) const;
 
     // Get a default config index that is valid for the given problem shape
     // This will be used as the fallback config if using auto-tuning
     [[nodiscard]] int64_t getDefaultValidConfigIndex(int32_t m, int32_t n, int32_t k,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches,
-        int32_t maxNumCtasInBatchDim) const;
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
+        int32_t validM = -1, int32_t validN = -1, int32_t validK = -1) const;
 
     [[nodiscard]] bool isValidConfigIndex(int32_t configIndex, int32_t m, int32_t n, int32_t k,
-        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches,
-        int32_t maxNumCtasInBatchDim) const;
+        std::vector<int32_t> const& batchedTokens, int32_t numTokens, int32_t numBatches, int32_t maxNumCtasInBatchDim,
+        int32_t validM = -1, int32_t validN = -1, int32_t validK = -1) const;
 
 private:
     void selectGemmConfig(int32_t m, int32_t n, int32_t k, std::vector<int32_t> const& batchedTokens, int32_t numTokens,
 
@@ -0,0 +1,78 @@
+---
+# clang-format configuration for the sources in this trtllmGen_bmm_export directory.
+# NOTE: QualifierAlignment and SeparateDefinitionBlocks (below) require clang-format 14 or newer.
+AccessModifierOffset: -4
+AlignAfterOpenBracket: DontAlign
+AlignConsecutiveAssignments: None
+AlignConsecutiveDeclarations: None
+AlignOperands:   false
+AlignTrailingComments: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortBlocksOnASingleLine: Empty
+AllowShortCaseLabelsOnASingleLine: true
+AllowShortFunctionsOnASingleLine: Empty
+AllowShortIfStatementsOnASingleLine: false
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: true
+AlwaysBreakTemplateDeclarations: Yes
+BasedOnStyle: None
+BinPackArguments: true
+BinPackParameters: true
+BreakBeforeBinaryOperators: All
+# Allman: opening braces always go on their own line.
+BreakBeforeBraces: Allman
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: true
+# Lines are wrapped at 120 columns.
+ColumnLimit:     120
+CommentPragmas:  '^ IWYU pragma:'
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DerivePointerAlignment: false
+DisableFormat:   false
+ExperimentalAutoDetectBinPacking: false
+ForEachMacros:   [ foreach, Q_FOREACH, BOOST_FOREACH ]
+# Include blocks are kept as written; together with SortIncludes: false (below) the include order is not changed.
+IncludeBlocks: Preserve
+IncludeCategories:
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+  - Regex:           '^(<|"(gtest|isl|json)/)'
+    Priority:        3
+  - Regex:           '.*'
+    Priority:        1
+IndentCaseLabels: false
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+KeepEmptyLinesAtTheStartOfBlocks: true
+Language: Cpp
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBlockIndentWidth: 4
+ObjCSpaceAfterProperty: true
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakString: 1000
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+# Pointer/reference symbols attach to the type, and qualifiers are placed to the right of it (e.g. `int const* p`).
+PointerAlignment: Left
+QualifierAlignment: Right
+ReflowComments:  true
+SeparateDefinitionBlocks: Always
+SortIncludes:    false
+SpaceAfterCStyleCast: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeParens: ControlStatements
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  false
+SpacesInCStyleCastParentheses: false
+SpacesInContainerLiterals: true
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+Standard:        c++14
+# Indentation uses 4 spaces and never tab characters.
+TabWidth:        4
+UseTab:          Never
@@ -16,8 +16,8 @@
  */
 #pragma once
 
-#include <cassert>
 #include <string>
+#include <cassert>
 
 namespace batchedGemm
 {
@@ -34,7 +34,9 @@ enum class RouteImpl
     // Use LDGSTS to do the routing
     Ldgsts = 1,
     // Use UTMALDG.GATHER4 to do the routing
-    Tma = 2
+    Tma = 2,
+    // Use LDG+STS to do the routing
+    LdgPlusSts = 3
 };
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -60,6 +62,13 @@ inline bool doesRouteImplUseTma(RouteImpl mode)
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
+// Returns true only when the routing implementation is RouteImpl::LdgPlusSts (LDG+STS routing).
+inline bool doesRouteImplUseLdgPlusSts(RouteImpl mode)
+{
+    bool const usesLdgPlusSts = (RouteImpl::LdgPlusSts == mode);
+    return usesLdgPlusSts;
+}
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
 } // namespace batchedGemm
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////