@@ -219,6 +219,91 @@ def moe_align1(
)
+@triton.jit
+def moe_align_fused_kernel(
+    topk_ids_ptr,  # [token_num, topk]
+    topk_weights_ptr,  # [token_num, topk]
+    expert_to_token_index_ptr,  # [expert_num, token_num * topk]
+    expert_to_weight_ptr,  # [expert_num, token_num * topk]
+    expert_token_num_ptr,  # [expert_num]
+    token_num,
+    topk_num: tl.constexpr,
+    BLOCK_SIZE: tl.constexpr,
+):
+    token_block = tl.program_id(0)
+    offs = token_block * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
+    mask = offs < token_num * topk_num
+
+    expert_ids = tl.load(topk_ids_ptr + offs, mask=mask, other=0)
+    weights = tl.load(topk_weights_ptr + offs, mask=mask, other=0.0)
+
+    # Use atomic_add to reserve a write slot for each token under its expert.
+    write_pos = tl.atomic_add(expert_token_num_ptr + expert_ids, 1, mask=mask)
+
+    # Scatter each token's flat index and routing weight into its expert's row.
+    tl.store(
+        expert_to_token_index_ptr + expert_ids * (token_num * topk_num) + write_pos,
+        offs,
+        mask=mask,
+    )
+    tl.store(
+        expert_to_weight_ptr + expert_ids * (token_num * topk_num) + write_pos,
+        weights,
+        mask=mask,
+    )
+
+
+def _get_moe_align_fused_static_key(
+    topk_weights: torch.Tensor,
+) -> dict:
+    topk_num = topk_weights.shape[1]
+    return {
+        "topk_num": topk_num,
+    }
+
+
+def _get_moe_align_fused_configs():
+    return [
+        {
+            "BLOCK_SIZE": bt,
+            "num_warps": nw,
+        }
+        for nw in [1, 2, 4, 8]
+        for bt in [128, 256, 512, 1024, 2048]
+    ]
+
+
+@autotune(
+    kernel_name="moe_align_fused:v1",
+    configs_gen_func=_get_moe_align_fused_configs,
+    static_key_func=_get_moe_align_fused_static_key,
+    run_key_func=lambda topk_ids: topk_ids.shape[0],
+    mutates_args=["expert_to_token_index", "expert_to_weight", "expert_token_num"],
+)
+def moe_align_fused(
+    expert_to_token_index, expert_to_weight, expert_token_num, topk_ids, topk_weights, run_config: Optional[dict] = None
+):
+    token_num, topk_num = topk_ids.shape
+    if run_config is None:
+        run_config = {}
+    BLOCK_SIZE = run_config.get("BLOCK_SIZE", 256)
+    num_warps = run_config.get("num_warps", 4)
+
+    grid = (triton.cdiv(token_num * topk_num, BLOCK_SIZE),)
+    moe_align_fused_kernel[grid](
+        topk_ids,
+        topk_weights,
+        expert_to_token_index,
+        expert_to_weight,
+        expert_token_num,
+        token_num,
+        topk_num,
+        BLOCK_SIZE=BLOCK_SIZE,
+        num_warps=num_warps,
+    )
+    return expert_to_token_index, expert_to_weight, expert_token_num
+
+
@triton.jit
def moe_align2_kernel(
    experts_token_num_ptr,  # [expert_num,]
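
For reference, below is a minimal standalone invocation of the new moe_align_fused wrapper, written against the signature added above. This is an illustrative sketch, not part of the commit: the sizes (token_num=8, topk=2, expert_num=4) are made up, the CUDA device is assumed, and the dtypes are chosen to match the int32/float32 output buffers used at the call site further down; the real topk_ids dtype may differ. Note that expert_token_num must be zero-initialized, since the kernel uses it as a per-expert atomic counter.

import torch

token_num, topk, expert_num = 8, 2, 4  # illustrative sizes only
topk_ids = torch.randint(0, expert_num, (token_num, topk), dtype=torch.int32, device="cuda")
topk_weights = torch.rand((token_num, topk), dtype=torch.float32, device="cuda")

expert_to_token_index = torch.empty((expert_num, token_num * topk), dtype=torch.int32, device="cuda")
expert_to_weight = torch.empty((expert_num, token_num * topk), dtype=torch.float32, device="cuda")
# Zero-initialized: the kernel atomically increments one counter per expert.
expert_token_num = torch.zeros((expert_num,), dtype=torch.int32, device="cuda")

moe_align_fused(
    expert_to_token_index=expert_to_token_index,
    expert_to_weight=expert_to_weight,
    expert_token_num=expert_token_num,
    topk_ids=topk_ids,
    topk_weights=topk_weights,
)
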
@@ -719,9 +804,14 @@ def fused_experts_impl(

    expert_to_tokens = torch.empty((E, topk_num * tokens_in_chunk), dtype=torch.int32, device="cuda")
    expert_to_weights = torch.empty((E, topk_num * tokens_in_chunk), dtype=torch.float32, device="cuda")
-    moe_align(topk_ids=curr_topk_ids, out=expert_to_tokens)
-    expert_to_token_num = torch.empty((E,), dtype=torch.int32, device="cuda")
-    moe_align1(expert_to_tokens, curr_topk_weights, expert_to_weights, expert_to_token_num, topk=topk_num)
+    expert_to_token_num = torch.zeros((E,), dtype=torch.int32, device="cuda")
+    moe_align_fused(
+        expert_to_token_index=expert_to_tokens,
+        expert_to_weight=expert_to_weights,
+        expert_token_num=expert_to_token_num,
+        topk_ids=curr_topk_ids,
+        topk_weights=curr_topk_weights,
+    )

    reused_mblock_infos = grouped_matmul(
        curr_topk_ids.numel(),
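
Continuing the sketch above, one possible sanity check (again my assumption, not part of this commit) is to compare the fused kernel's output against a pure-PyTorch reference. Because write slots are claimed with atomic_add, the ordering of entries within an expert's row is not deterministic, so the check compares per-expert counts and membership rather than exact positions.

# Reference counts per expert, computed from the routing table itself.
counts = torch.bincount(topk_ids.flatten().to(torch.int64), minlength=expert_num)
assert torch.equal(expert_token_num.to(torch.int64).cpu(), counts.cpu())

# Per-expert membership: every flat (token, slot) index routed to expert e
# must appear exactly once in that expert's row, in any order.
flat_ids = topk_ids.flatten()
for e in range(expert_num):
    n = int(expert_token_num[e])
    got = set(expert_to_token_index[e, :n].tolist())
    want = set((flat_ids == e).nonzero(as_tuple=True)[0].tolist())
    assert got == want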