Commit 5649898

[TRTLLM-9082][feat] AutoDeploy: Move the moe Align kernel to AOT (#9106)
Signed-off-by: Chenghao Zhang <[email protected]>
1 parent eb7792e commit 5649898

File tree: 5 files changed, +200 -103 lines changed
cpp/tensorrt_llm/kernels/moeAlignKernels.cu

Lines changed: 78 additions & 69 deletions

@@ -1,27 +1,30 @@
-// Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
-// Licensed under the Apache License, Version 2.0 (the "License");
-// http://www.apache.org/licenses/LICENSE-2.0
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 
 // Inspired by vLLM's moe_align_kernel.cu and ported to TensorRT-LLM
 
-#include <ATen/ATen.h>
-#include <ATen/cuda/Atomic.cuh>
-#include <ATen/cuda/CUDAContext.h>
-#include <c10/cuda/CUDAGuard.h>
+#include "moeAlignKernels.h"
+#include "tensorrt_llm/common/assert.h"
+#include "tensorrt_llm/common/cudaUtils.h"
 #include <cub/cub.cuh>
-#include <torch/extension.h>
 
 #define CEILDIV(x, y) (((x) + (y) -1) / (y))
 #define WARP_SIZE 32
 
-namespace auto_deploy
-{
-namespace moe
+namespace tensorrt_llm::kernels
 {
 
 template <typename scalar_t>

@@ -204,68 +207,74 @@ __global__ void moe_align_block_size_small_batch_expert_kernel(scalar_t const* _
     }
 }
 
-} // namespace moe
-} // namespace auto_deploy
-
-void moe_align_block_size_cuda(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size,
-    torch::Tensor sorted_token_ids, torch::Tensor experts_ids, torch::Tensor num_tokens_post_pad)
+template <typename scalar_t>
+void invokeMoeAlignBlockSizeTyped(scalar_t const* topk_ids, int32_t* sorted_token_ids, int32_t* expert_ids,
+    int32_t* num_tokens_post_pad, int32_t num_experts, int32_t block_size, int32_t numel, int32_t max_num_tokens_padded,
+    cudaStream_t stream)
 {
-
-    const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-
     int64_t padded_num_experts = ((num_experts + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
     int experts_per_warp = WARP_SIZE;
     int threads = 1024;
     threads = ((threads + WARP_SIZE - 1) / WARP_SIZE) * WARP_SIZE;
 
     // BlockScan uses 1024 threads and assigns one thread per expert.
-    TORCH_CHECK(padded_num_experts < 1024, "padded_num_experts must be less than 1024");
+    TLLM_CHECK_WITH_INFO(padded_num_experts < 1024, "padded_num_experts must be less than 1024");
 
-    AT_DISPATCH_INTEGRAL_TYPES(topk_ids.scalar_type(), "moe_align_block_size_kernel",
-        [&]
-        {
-            // calc needed amount of shared mem for `cumsum` tensors
-            auto options_int = torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
-            torch::Tensor cumsum_buffer = torch::empty({num_experts + 1}, options_int);
-            bool small_batch_expert_mode = (topk_ids.numel() < 1024) && (num_experts <= 64);
-
-            if (small_batch_expert_mode)
-            {
-                const int32_t threads = std::max((int32_t) num_experts, (int32_t) WARP_SIZE);
-                const int32_t shared_mem_size = ((threads + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
-
-                auto small_batch_expert_kernel
-                    = auto_deploy::moe::moe_align_block_size_small_batch_expert_kernel<scalar_t>;
-                small_batch_expert_kernel<<<1, threads, shared_mem_size, stream>>>(topk_ids.data_ptr<scalar_t>(),
-                    sorted_token_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(),
-                    num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size, topk_ids.numel(),
-                    sorted_token_ids.size(0));
-            }
-            else
-            {
-                auto align_kernel = auto_deploy::moe::moe_align_block_size_kernel<scalar_t>;
-
-                size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
-                size_t shared_mem_size = num_warps * experts_per_warp * sizeof(int32_t);
-
-                align_kernel<<<1, threads, shared_mem_size, stream>>>(topk_ids.data_ptr<scalar_t>(),
-                    sorted_token_ids.data_ptr<int32_t>(), experts_ids.data_ptr<int32_t>(),
-                    num_tokens_post_pad.data_ptr<int32_t>(), num_experts, padded_num_experts, experts_per_warp,
-                    block_size, topk_ids.numel(), cumsum_buffer.data_ptr<int32_t>(), sorted_token_ids.size(0));
-
-                const int block_threads = std::min(256, (int) threads);
-                const int num_blocks = (topk_ids.numel() + block_threads - 1) / block_threads;
-                const int max_blocks = 65535;
-                const int actual_blocks = std::min(num_blocks, max_blocks);
-
-                auto sort_kernel = auto_deploy::moe::count_and_sort_expert_tokens_kernel<scalar_t>;
-                sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(topk_ids.data_ptr<scalar_t>(),
-                    sorted_token_ids.data_ptr<int32_t>(), cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
-            }
-        });
+    // Allocate temporary cumsum buffer
+    int32_t* cumsum_buffer;
+    cudaMallocAsync(&cumsum_buffer, (num_experts + 1) * sizeof(int32_t), stream);
+    cudaMemsetAsync(cumsum_buffer, 0, (num_experts + 1) * sizeof(int32_t), stream);
+
+    bool small_batch_expert_mode = (numel < 1024) && (num_experts <= 64);
+
+    if (small_batch_expert_mode)
+    {
+        const int32_t thread_count = std::max((int32_t) num_experts, (int32_t) WARP_SIZE);
+        const int32_t shared_mem_size = ((thread_count + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
+
+        moe_align_block_size_small_batch_expert_kernel<scalar_t><<<1, thread_count, shared_mem_size, stream>>>(topk_ids,
+            sorted_token_ids, expert_ids, num_tokens_post_pad, num_experts, block_size, numel, max_num_tokens_padded);
+    }
+    else
+    {
+        size_t num_warps = CEILDIV(padded_num_experts, experts_per_warp);
+        size_t shared_mem_size = num_warps * experts_per_warp * sizeof(int32_t);
+
+        moe_align_block_size_kernel<scalar_t><<<1, threads, shared_mem_size, stream>>>(topk_ids, sorted_token_ids,
+            expert_ids, num_tokens_post_pad, num_experts, padded_num_experts, experts_per_warp, block_size, numel,
+            cumsum_buffer, max_num_tokens_padded);
+
+        int const block_threads = std::min(256, (int) threads);
+        int const num_blocks = (numel + block_threads - 1) / block_threads;
+        int const max_blocks = 65535;
+        int const actual_blocks = std::min(num_blocks, max_blocks);
+
+        count_and_sort_expert_tokens_kernel<scalar_t>
+            <<<actual_blocks, block_threads, 0, stream>>>(topk_ids, sorted_token_ids, cumsum_buffer, numel);
+    }
+
+    cudaFreeAsync(cumsum_buffer, stream);
 }
 
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m)
+void invokeMoeAlignBlockSize(void const* topk_ids, int32_t topk_ids_dtype_size, int32_t* sorted_token_ids,
+    int32_t* expert_ids, int32_t* num_tokens_post_pad, int32_t num_experts, int32_t block_size, int32_t numel,
+    int32_t max_num_tokens_padded, cudaStream_t stream)
 {
-    m.def("moe_align_block_size", &moe_align_block_size_cuda, "MoE align block size (CUDA)");
+    // Dispatch based on dtype size
+    if (topk_ids_dtype_size == sizeof(int32_t))
+    {
+        invokeMoeAlignBlockSizeTyped(static_cast<int32_t const*>(topk_ids), sorted_token_ids, expert_ids,
+            num_tokens_post_pad, num_experts, block_size, numel, max_num_tokens_padded, stream);
+    }
+    else if (topk_ids_dtype_size == sizeof(int64_t))
+    {
+        invokeMoeAlignBlockSizeTyped(static_cast<int64_t const*>(topk_ids), sorted_token_ids, expert_ids,
+            num_tokens_post_pad, num_experts, block_size, numel, max_num_tokens_padded, stream);
+    }
+    else
+    {
+        TLLM_CHECK_WITH_INFO(false, "Unsupported topk_ids dtype size: %d", topk_ids_dtype_size);
+    }
 }
+
+} // namespace tensorrt_llm::kernels
cpp/tensorrt_llm/kernels/moeAlignKernels.h

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <stdint.h>
+
+namespace tensorrt_llm::kernels
+{
+
+/**
+ * @brief Aligns token distribution across experts to be compatible with block size for matrix multiplication.
+ *
+ * This kernel sorts tokens by expert assignment and pads the distribution to match block size requirements.
+ * Inspired by vLLM's moe_align_kernel and ported to TensorRT-LLM.
+ *
+ * @param topk_ids Input tensor with expert IDs per token [total_tokens, top_k]
+ * @param topk_ids_dtype_size Size of the dtype (e.g., sizeof(int32_t) or sizeof(int64_t))
+ * @param sorted_token_ids Output tensor for sorted token indices
+ * @param expert_ids Output tensor for expert IDs per block
+ * @param num_tokens_post_pad Output tensor for total tokens after padding (single int32)
+ * @param num_experts Total number of experts
+ * @param block_size Block size for matrix multiplication alignment
+ * @param numel Total number of elements in topk_ids (topk_ids.numel())
+ * @param max_num_tokens_padded Maximum number of tokens after padding (sorted_token_ids.size(0))
+ * @param stream CUDA stream for kernel execution
+ */
+void invokeMoeAlignBlockSize(void const* topk_ids, int32_t topk_ids_dtype_size, int32_t* sorted_token_ids,
+    int32_t* expert_ids, int32_t* num_tokens_post_pad, int32_t num_experts, int32_t block_size, int32_t numel,
+    int32_t max_num_tokens_padded, cudaStream_t stream);
+
+} // namespace tensorrt_llm::kernels

cpp/tensorrt_llm/thop/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -78,6 +78,7 @@ add_library(
 moeCommOp.cpp
 moeAlltoAllOp.cpp
 moeLoadBalanceOp.cpp
+moeAlignOp.cpp
 mxFp4BlockScaleMoe.cpp
 mxFp8Quantize.cpp
 fp8BlockScaleMoe.cpp
cpp/tensorrt_llm/thop/moeAlignOp.cpp

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorrt_llm/kernels/moeAlignKernels.h"
+#include "thUtils.h"
+#include <torch/extension.h>
+
+namespace tk = tensorrt_llm::kernels;
+
+namespace torch_ext
+{
+
+void moeAlignBlockSizeOp(torch::Tensor topk_ids, int64_t num_experts, int64_t block_size,
+    torch::Tensor sorted_token_ids, torch::Tensor expert_ids, torch::Tensor num_tokens_post_pad)
+{
+    // Validate inputs
+    CHECK_TH_CUDA(topk_ids);
+    CHECK_CONTIGUOUS(topk_ids);
+    CHECK_INPUT(sorted_token_ids, torch::kInt32);
+    CHECK_INPUT(expert_ids, torch::kInt32);
+    CHECK_INPUT(num_tokens_post_pad, torch::kInt32);
+
+    TORCH_CHECK(topk_ids.scalar_type() == torch::kInt32 || topk_ids.scalar_type() == torch::kInt64,
+        "topk_ids must be int32 or int64");
+
+    auto stream = at::cuda::getCurrentCUDAStream();
+
+    tk::invokeMoeAlignBlockSize(topk_ids.data_ptr(), topk_ids.element_size(), sorted_token_ids.data_ptr<int32_t>(),
+        expert_ids.data_ptr<int32_t>(), num_tokens_post_pad.data_ptr<int32_t>(), static_cast<int32_t>(num_experts),
+        static_cast<int32_t>(block_size), static_cast<int32_t>(topk_ids.numel()),
+        static_cast<int32_t>(sorted_token_ids.size(0)), stream);
+}
+
+} // namespace torch_ext
+
+TORCH_LIBRARY_FRAGMENT(trtllm, m)
+{
+    m.def(
+        "moe_align_block_size(Tensor topk_ids, int num_experts, int block_size, "
+        "Tensor(a!) sorted_token_ids, Tensor(a!) expert_ids, Tensor(a!) num_tokens_post_pad) -> ()");
+}
+
+TORCH_LIBRARY_IMPL(trtllm, CUDA, m)
+{
+    m.impl("moe_align_block_size", &torch_ext::moeAlignBlockSizeOp);
+}
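
Note: the schema above registers the op as torch.ops.trtllm.moe_align_block_size and marks the three outputs as mutable (Tensor(a!)), so callers pre-allocate them and the op writes them in place. Below is a minimal sketch of calling the registered op directly; the shapes and the vLLM-style sizing of the padded buffers are illustrative assumptions, not part of this diff.

import torch
import tensorrt_llm  # assumed to load the trtllm torch op library

# Illustrative shapes: 8 tokens, top-2 routing over 16 experts (assumed values).
num_tokens, top_k, num_experts, block_size = 8, 2, 16, 64
topk_ids = torch.randint(0, num_experts, (num_tokens, top_k), dtype=torch.int32, device="cuda")

# Assumed vLLM-style sizing: in the worst case every expert needs block_size - 1 padding slots.
max_num_tokens_padded = topk_ids.numel() + num_experts * (block_size - 1)
sorted_token_ids = torch.empty(max_num_tokens_padded, dtype=torch.int32, device="cuda")
expert_ids = torch.empty((max_num_tokens_padded + block_size - 1) // block_size, dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty(1, dtype=torch.int32, device="cuda")

# Outputs are filled in place (Tensor(a!) in the schema above).
torch.ops.trtllm.moe_align_block_size(
    topk_ids, num_experts, block_size, sorted_token_ids, expert_ids, num_tokens_post_pad
)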
Lines changed: 16 additions & 34 deletions

@@ -1,40 +1,19 @@
 """
-Build moe_align CUDA extension eagerly with a persistent build directory
-(same workflow as agent_ops/load_moe.py).
-"""
-
-import os
-import tempfile
-
-import torch
-from torch.utils.cpp_extension import load
+AOT-compiled moe_align CUDA kernel.
 
-# Recommend explicit arch list so NVCC targets the right GPUs. You can override via env.
-os.environ.setdefault("TORCH_CUDA_ARCH_LIST", "8.0;8.6;8.9;9.0")
+The moe_align kernel is now compiled ahead-of-time (AOT) as part of the main
+TensorRT-LLM build instead of being JIT-compiled on first use. This reduces
+startup time and avoids compilation overhead.
 
-THIS_DIR = os.path.dirname(os.path.abspath(__file__))
+The kernel implementation is in:
+- cpp/tensorrt_llm/kernels/moeAlignKernels.cu
+- cpp/tensorrt_llm/kernels/moeAlignKernels.h
 
-# Use system temp directory to avoid environment variable dependency
-BUILD_DIR = os.path.join(tempfile.gettempdir(), "ad_cache", "auto_deploy", "fused_moe", "moe_align")
-os.makedirs(BUILD_DIR, exist_ok=True)
+The torch binding is in:
+- cpp/tensorrt_llm/thop/moeAlignOp.cpp
+"""
 
-moe_align_ext = load(
-    name="moe_align_ext",
-    sources=[os.path.join(THIS_DIR, "moe_align_kernel.cu")],
-    extra_cflags=["-O3"],
-    extra_cuda_cflags=[
-        "-O3",
-        "--use_fast_math",
-        "-U__CUDA_NO_HALF_OPERATORS__",
-        "-U__CUDA_NO_HALF_CONVERSIONS__",
-        "--expt-relaxed-constexpr",
-        # Optional: "-Xptxas=-v",
-    ],
-    verbose=True,
-    with_cuda=True,
-    build_directory=BUILD_DIR,
-    is_python_module=True,
-)
+import torch
 
 
 def moe_align_block_size(

@@ -46,7 +25,7 @@ def moe_align_block_size(
     num_tokens_post_pad: torch.Tensor,
 ):
     """
-    Wrapper for the CUDA moe_align_block_size function.
+    Wrapper for the AOT-compiled moe_align_block_size function.
 
     Aligns the token distribution across experts to be compatible with block
     size for matrix multiplication.

@@ -64,6 +43,7 @@
         raise ValueError("topk_ids must be a CUDA tensor")
     if not topk_ids.is_contiguous():
         topk_ids = topk_ids.contiguous()
+
     for t, name in [
        (sorted_token_ids, "sorted_token_ids"),
        (expert_ids, "expert_ids"),

@@ -73,13 +53,15 @@
            raise ValueError(f"{name} must be a CUDA tensor")
        if not t.is_contiguous():
            raise ValueError(f"{name} must be contiguous")
+
    if (
        sorted_token_ids.dtype != torch.int32
        or expert_ids.dtype != torch.int32
        or num_tokens_post_pad.dtype != torch.int32
    ):
        raise TypeError("sorted_token_ids, expert_ids, num_tokens_post_pad must be int32 tensors")
 
-    moe_align_ext.moe_align_block_size(
+    # Call the AOT-compiled kernel via torch ops
+    torch.ops.trtllm.moe_align_block_size(
        topk_ids, num_experts, block_size, sorted_token_ids, expert_ids, num_tokens_post_pad
    )
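
For context, a hedged sketch of how this wrapper might be driven from router output; the router_logits shape, top-k value, and buffer sizing below are assumptions for illustration, and the import path for moe_align_block_size depends on where this wrapper module lives in the AutoDeploy tree.

import torch
# from <this wrapper module> import moe_align_block_size  # hypothetical import path

router_logits = torch.randn(8, 16, device="cuda")  # [num_tokens, num_experts], assumed shape
_, topk_ids = torch.topk(router_logits, k=2, dim=-1)
topk_ids = topk_ids.to(torch.int32)

num_experts, block_size = router_logits.size(-1), 64
max_padded = topk_ids.numel() + num_experts * (block_size - 1)  # assumed sizing convention
sorted_token_ids = torch.empty(max_padded, dtype=torch.int32, device="cuda")
expert_ids = torch.empty((max_padded + block_size - 1) // block_size, dtype=torch.int32, device="cuda")
num_tokens_post_pad = torch.empty(1, dtype=torch.int32, device="cuda")

moe_align_block_size(topk_ids, num_experts, block_size, sorted_token_ids, expert_ids, num_tokens_post_pad)

# num_tokens_post_pad[0] holds the padded token count; expert_ids[i] is the expert
# that owns the i-th block_size-sized chunk of sorted_token_ids.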
