Skip to content

Commit 98d385a

Browse files
hiworldwzjwangzaijun
andauthored
moe triton kernel use tma. (#1100)
Co-authored-by: wangzaijun <[email protected]>
1 parent 7391a99 commit 98d385a

14 files changed

+344
-206
lines changed

lightllm/common/fused_moe/grouped_fused_moe.py

Lines changed: 197 additions & 89 deletions
Large diffs are not rendered by default.

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=256,N=7168,expert_num=256,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json

Lines changed: 15 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
"BLOCK_SIZE_K": 128,
44
"BLOCK_SIZE_M": 16,
55
"BLOCK_SIZE_N": 128,
6-
"GROUP_SIZE_M": 64,
6+
"GROUP_SIZE_M": 32,
77
"NEED_TRANS": true,
88
"num_stages": 2,
99
"num_warps": 4
@@ -17,15 +17,6 @@
1717
"num_stages": 2,
1818
"num_warps": 4
1919
},
20-
"131072": {
21-
"BLOCK_SIZE_K": 128,
22-
"BLOCK_SIZE_M": 64,
23-
"BLOCK_SIZE_N": 128,
24-
"GROUP_SIZE_M": 16,
25-
"NEED_TRANS": false,
26-
"num_stages": 3,
27-
"num_warps": 4
28-
},
2920
"16384": {
3021
"BLOCK_SIZE_K": 128,
3122
"BLOCK_SIZE_M": 64,
@@ -36,12 +27,12 @@
3627
"num_warps": 4
3728
},
3829
"2048": {
39-
"BLOCK_SIZE_K": 128,
30+
"BLOCK_SIZE_K": 64,
4031
"BLOCK_SIZE_M": 16,
4132
"BLOCK_SIZE_N": 128,
42-
"GROUP_SIZE_M": 64,
33+
"GROUP_SIZE_M": 32,
4334
"NEED_TRANS": true,
44-
"num_stages": 2,
35+
"num_stages": 3,
4536
"num_warps": 4
4637
},
4738
"256": {
@@ -53,15 +44,6 @@
5344
"num_stages": 2,
5445
"num_warps": 4
5546
},
56-
"32": {
57-
"BLOCK_SIZE_K": 128,
58-
"BLOCK_SIZE_M": 16,
59-
"BLOCK_SIZE_N": 128,
60-
"GROUP_SIZE_M": 64,
61-
"NEED_TRANS": true,
62-
"num_stages": 2,
63-
"num_warps": 4
64-
},
6547
"32768": {
6648
"BLOCK_SIZE_K": 128,
6749
"BLOCK_SIZE_M": 64,
@@ -89,13 +71,22 @@
8971
"num_stages": 2,
9072
"num_warps": 4
9173
},
74+
"67584": {
75+
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_M": 64,
77+
"BLOCK_SIZE_N": 128,
78+
"GROUP_SIZE_M": 16,
79+
"NEED_TRANS": false,
80+
"num_stages": 3,
81+
"num_warps": 4
82+
},
9283
"8": {
9384
"BLOCK_SIZE_K": 64,
9485
"BLOCK_SIZE_M": 16,
9586
"BLOCK_SIZE_N": 128,
96-
"GROUP_SIZE_M": 64,
87+
"GROUP_SIZE_M": 32,
9788
"NEED_TRANS": true,
98-
"num_stages": 3,
89+
"num_stages": 2,
9990
"num_warps": 4
10091
},
10192
"800": {

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=256,N=7168,expert_num=257,mul_routed_weight=true,out_dtype=torch.bfloat16,topk_num=1,use_fp8_w8a8=true}_NVIDIA_H200.json

Lines changed: 12 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -17,20 +17,11 @@
1717
"num_stages": 2,
1818
"num_warps": 4
1919
},
20-
"147456": {
21-
"BLOCK_SIZE_K": 128,
22-
"BLOCK_SIZE_M": 64,
23-
"BLOCK_SIZE_N": 128,
24-
"GROUP_SIZE_M": 16,
25-
"NEED_TRANS": false,
26-
"num_stages": 3,
27-
"num_warps": 4
28-
},
2920
"18432": {
3021
"BLOCK_SIZE_K": 128,
3122
"BLOCK_SIZE_M": 64,
3223
"BLOCK_SIZE_N": 128,
33-
"GROUP_SIZE_M": 32,
24+
"GROUP_SIZE_M": 64,
3425
"NEED_TRANS": false,
3526
"num_stages": 3,
3627
"num_warps": 4
@@ -53,15 +44,6 @@
5344
"num_stages": 2,
5445
"num_warps": 4
5546
},
56-
"36": {
57-
"BLOCK_SIZE_K": 128,
58-
"BLOCK_SIZE_M": 16,
59-
"BLOCK_SIZE_N": 128,
60-
"GROUP_SIZE_M": 64,
61-
"NEED_TRANS": true,
62-
"num_stages": 2,
63-
"num_warps": 4
64-
},
6547
"36864": {
6648
"BLOCK_SIZE_K": 128,
6749
"BLOCK_SIZE_M": 64,
@@ -89,13 +71,22 @@
8971
"num_stages": 2,
9072
"num_warps": 4
9173
},
74+
"76032": {
75+
"BLOCK_SIZE_K": 128,
76+
"BLOCK_SIZE_M": 64,
77+
"BLOCK_SIZE_N": 128,
78+
"GROUP_SIZE_M": 16,
79+
"NEED_TRANS": false,
80+
"num_stages": 3,
81+
"num_warps": 4
82+
},
9283
"9": {
9384
"BLOCK_SIZE_K": 64,
9485
"BLOCK_SIZE_M": 16,
9586
"BLOCK_SIZE_N": 128,
9687
"GROUP_SIZE_M": 64,
9788
"NEED_TRANS": true,
98-
"num_stages": 3,
89+
"num_stages": 2,
9990
"num_warps": 4
10091
},
10192
"900": {
@@ -111,7 +102,7 @@
111102
"BLOCK_SIZE_K": 128,
112103
"BLOCK_SIZE_M": 64,
113104
"BLOCK_SIZE_N": 128,
114-
"GROUP_SIZE_M": 64,
105+
"GROUP_SIZE_M": 32,
115106
"NEED_TRANS": false,
116107
"num_stages": 3,
117108
"num_warps": 4

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=7168,N=512,expert_num=256,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=8,use_fp8_w8a8=true}_NVIDIA_H200.json

Lines changed: 22 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,16 @@
1212
"BLOCK_SIZE_K": 128,
1313
"BLOCK_SIZE_M": 16,
1414
"BLOCK_SIZE_N": 64,
15-
"GROUP_SIZE_M": 16,
15+
"GROUP_SIZE_M": 1,
1616
"NEED_TRANS": true,
17-
"num_stages": 4,
17+
"num_stages": 5,
1818
"num_warps": 4
1919
},
2020
"1024": {
2121
"BLOCK_SIZE_K": 128,
2222
"BLOCK_SIZE_M": 64,
2323
"BLOCK_SIZE_N": 128,
24-
"GROUP_SIZE_M": 16,
24+
"GROUP_SIZE_M": 1,
2525
"NEED_TRANS": false,
2626
"num_stages": 4,
2727
"num_warps": 4
@@ -30,34 +30,25 @@
3030
"BLOCK_SIZE_K": 128,
3131
"BLOCK_SIZE_M": 16,
3232
"BLOCK_SIZE_N": 128,
33-
"GROUP_SIZE_M": 64,
33+
"GROUP_SIZE_M": 16,
3434
"NEED_TRANS": true,
3535
"num_stages": 5,
3636
"num_warps": 4
3737
},
3838
"16": {
3939
"BLOCK_SIZE_K": 128,
4040
"BLOCK_SIZE_M": 16,
41-
"BLOCK_SIZE_N": 128,
41+
"BLOCK_SIZE_N": 64,
4242
"GROUP_SIZE_M": 32,
4343
"NEED_TRANS": true,
44-
"num_stages": 4,
45-
"num_warps": 4
46-
},
47-
"16384": {
48-
"BLOCK_SIZE_K": 128,
49-
"BLOCK_SIZE_M": 64,
50-
"BLOCK_SIZE_N": 128,
51-
"GROUP_SIZE_M": 16,
52-
"NEED_TRANS": false,
53-
"num_stages": 4,
44+
"num_stages": 3,
5445
"num_warps": 4
5546
},
5647
"2048": {
5748
"BLOCK_SIZE_K": 128,
5849
"BLOCK_SIZE_M": 64,
5950
"BLOCK_SIZE_N": 128,
60-
"GROUP_SIZE_M": 1,
51+
"GROUP_SIZE_M": 16,
6152
"NEED_TRANS": false,
6253
"num_stages": 4,
6354
"num_warps": 4
@@ -66,27 +57,18 @@
6657
"BLOCK_SIZE_K": 128,
6758
"BLOCK_SIZE_M": 16,
6859
"BLOCK_SIZE_N": 128,
69-
"GROUP_SIZE_M": 64,
60+
"GROUP_SIZE_M": 16,
7061
"NEED_TRANS": true,
7162
"num_stages": 3,
7263
"num_warps": 4
7364
},
7465
"32": {
7566
"BLOCK_SIZE_K": 128,
76-
"BLOCK_SIZE_M": 16,
67+
"BLOCK_SIZE_M": 32,
7768
"BLOCK_SIZE_N": 64,
78-
"GROUP_SIZE_M": 32,
79-
"NEED_TRANS": true,
80-
"num_stages": 4,
81-
"num_warps": 4
82-
},
83-
"4": {
84-
"BLOCK_SIZE_K": 128,
85-
"BLOCK_SIZE_M": 16,
86-
"BLOCK_SIZE_N": 64,
87-
"GROUP_SIZE_M": 16,
69+
"GROUP_SIZE_M": 1,
8870
"NEED_TRANS": true,
89-
"num_stages": 5,
71+
"num_stages": 3,
9072
"num_warps": 4
9173
},
9274
"4096": {
@@ -102,9 +84,9 @@
10284
"BLOCK_SIZE_K": 128,
10385
"BLOCK_SIZE_M": 16,
10486
"BLOCK_SIZE_N": 64,
105-
"GROUP_SIZE_M": 32,
87+
"GROUP_SIZE_M": 64,
10688
"NEED_TRANS": true,
107-
"num_stages": 4,
89+
"num_stages": 3,
10890
"num_warps": 4
10991
},
11092
"8": {
@@ -113,6 +95,15 @@
11395
"BLOCK_SIZE_N": 64,
11496
"GROUP_SIZE_M": 32,
11597
"NEED_TRANS": true,
98+
"num_stages": 5,
99+
"num_warps": 4
100+
},
101+
"8448": {
102+
"BLOCK_SIZE_K": 128,
103+
"BLOCK_SIZE_M": 64,
104+
"BLOCK_SIZE_N": 128,
105+
"GROUP_SIZE_M": 16,
106+
"NEED_TRANS": false,
116107
"num_stages": 4,
117108
"num_warps": 4
118109
}

lightllm/common/triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200/grouped_matmul:v1/{K=7168,N=512,expert_num=257,mul_routed_weight=false,out_dtype=torch.bfloat16,topk_num=9,use_fp8_w8a8=true}_NVIDIA_H200.json

Lines changed: 28 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"BLOCK_SIZE_N": 64,
66
"GROUP_SIZE_M": 1,
77
"NEED_TRANS": true,
8-
"num_stages": 4,
8+
"num_stages": 5,
99
"num_warps": 4
1010
},
1111
"100": {
@@ -18,39 +18,30 @@
1818
"num_warps": 4
1919
},
2020
"1024": {
21-
"BLOCK_SIZE_K": 128,
21+
"BLOCK_SIZE_K": 64,
2222
"BLOCK_SIZE_M": 64,
23-
"BLOCK_SIZE_N": 64,
24-
"GROUP_SIZE_M": 1,
23+
"BLOCK_SIZE_N": 128,
24+
"GROUP_SIZE_M": 16,
2525
"NEED_TRANS": false,
26-
"num_stages": 4,
26+
"num_stages": 5,
2727
"num_warps": 4
2828
},
2929
"128": {
3030
"BLOCK_SIZE_K": 128,
3131
"BLOCK_SIZE_M": 16,
32-
"BLOCK_SIZE_N": 128,
33-
"GROUP_SIZE_M": 16,
32+
"BLOCK_SIZE_N": 64,
33+
"GROUP_SIZE_M": 1,
3434
"NEED_TRANS": true,
35-
"num_stages": 3,
35+
"num_stages": 5,
3636
"num_warps": 4
3737
},
3838
"16": {
3939
"BLOCK_SIZE_K": 128,
40-
"BLOCK_SIZE_M": 32,
41-
"BLOCK_SIZE_N": 128,
42-
"GROUP_SIZE_M": 16,
40+
"BLOCK_SIZE_M": 16,
41+
"BLOCK_SIZE_N": 64,
42+
"GROUP_SIZE_M": 1,
4343
"NEED_TRANS": true,
44-
"num_stages": 3,
45-
"num_warps": 4
46-
},
47-
"16384": {
48-
"BLOCK_SIZE_K": 128,
49-
"BLOCK_SIZE_M": 64,
50-
"BLOCK_SIZE_N": 128,
51-
"GROUP_SIZE_M": 16,
52-
"NEED_TRANS": false,
53-
"num_stages": 4,
44+
"num_stages": 5,
5445
"num_warps": 4
5546
},
5647
"2048": {
@@ -64,29 +55,20 @@
6455
},
6556
"256": {
6657
"BLOCK_SIZE_K": 128,
67-
"BLOCK_SIZE_M": 32,
58+
"BLOCK_SIZE_M": 16,
6859
"BLOCK_SIZE_N": 128,
69-
"GROUP_SIZE_M": 16,
60+
"GROUP_SIZE_M": 1,
7061
"NEED_TRANS": true,
7162
"num_stages": 4,
7263
"num_warps": 4
7364
},
7465
"32": {
7566
"BLOCK_SIZE_K": 128,
76-
"BLOCK_SIZE_M": 16,
67+
"BLOCK_SIZE_M": 32,
7768
"BLOCK_SIZE_N": 64,
7869
"GROUP_SIZE_M": 16,
7970
"NEED_TRANS": true,
80-
"num_stages": 4,
81-
"num_warps": 4
82-
},
83-
"4": {
84-
"BLOCK_SIZE_K": 128,
85-
"BLOCK_SIZE_M": 16,
86-
"BLOCK_SIZE_N": 64,
87-
"GROUP_SIZE_M": 32,
88-
"NEED_TRANS": true,
89-
"num_stages": 5,
71+
"num_stages": 3,
9072
"num_warps": 4
9173
},
9274
"4096": {
@@ -102,17 +84,26 @@
10284
"BLOCK_SIZE_K": 128,
10385
"BLOCK_SIZE_M": 16,
10486
"BLOCK_SIZE_N": 64,
105-
"GROUP_SIZE_M": 1,
87+
"GROUP_SIZE_M": 16,
10688
"NEED_TRANS": true,
10789
"num_stages": 3,
10890
"num_warps": 4
10991
},
11092
"8": {
11193
"BLOCK_SIZE_K": 128,
112-
"BLOCK_SIZE_M": 32,
94+
"BLOCK_SIZE_M": 16,
11395
"BLOCK_SIZE_N": 64,
114-
"GROUP_SIZE_M": 1,
96+
"GROUP_SIZE_M": 16,
11597
"NEED_TRANS": true,
98+
"num_stages": 3,
99+
"num_warps": 4
100+
},
101+
"8448": {
102+
"BLOCK_SIZE_K": 128,
103+
"BLOCK_SIZE_M": 64,
104+
"BLOCK_SIZE_N": 128,
105+
"GROUP_SIZE_M": 16,
106+
"NEED_TRANS": false,
116107
"num_stages": 4,
117108
"num_warps": 4
118109
}

0 commit comments

Comments
 (0)