Fix numBlocksCoop: use the device multiprocessor count instead of a hard-coded 128

syuoni · syuoni · commit b86561e498cb · 2025-11-21T02:50:38.000Z
Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/RoutingDeepSeek.cu
@@ -647,8 +647,7 @@ void run(Data& data, void* stream)
     //
     // The upper bound is a strict requirement. The number of blocks should be determined by querying
     // the device properties, or conservatively low.
-    // /!\ The following number is not portable!! (but works on H100 and B200)
-    int const numBlocksCoop = 128;
+    static int const numBlocksCoop = tensorrt_llm::common::getMultiProcessorCount();
 
     // Maximum number of tokens supported by the kernel using a cooperative launch.
     int const maxTokensCoop = (numBlocksCoop * numThreadsHist * 64) / data.mTopK;