File tree Expand file tree Collapse file tree 14 files changed +344
-206
lines changed
triton_utils/autotune_kernel_configs/triton_3.4.0/NVIDIA_H200
unit_tests/common/fused_moe Expand file tree Collapse file tree 14 files changed +344
-206
lines changed Load Diff Large diffs are not rendered by default.
Original file line number Diff line number Diff line change 33 "BLOCK_SIZE_K" : 128 ,
44 "BLOCK_SIZE_M" : 16 ,
55 "BLOCK_SIZE_N" : 128 ,
6- "GROUP_SIZE_M" : 64 ,
6+ "GROUP_SIZE_M" : 32 ,
77 "NEED_TRANS" : true ,
88 "num_stages" : 2 ,
99 "num_warps" : 4
1717 "num_stages" : 2 ,
1818 "num_warps" : 4
1919 },
20- "131072" : {
21- "BLOCK_SIZE_K" : 128 ,
22- "BLOCK_SIZE_M" : 64 ,
23- "BLOCK_SIZE_N" : 128 ,
24- "GROUP_SIZE_M" : 16 ,
25- "NEED_TRANS" : false ,
26- "num_stages" : 3 ,
27- "num_warps" : 4
28- },
2920 "16384" : {
3021 "BLOCK_SIZE_K" : 128 ,
3122 "BLOCK_SIZE_M" : 64 ,
3627 "num_warps" : 4
3728 },
3829 "2048" : {
39- "BLOCK_SIZE_K" : 128 ,
30+ "BLOCK_SIZE_K" : 64 ,
4031 "BLOCK_SIZE_M" : 16 ,
4132 "BLOCK_SIZE_N" : 128 ,
42- "GROUP_SIZE_M" : 64 ,
33+ "GROUP_SIZE_M" : 32 ,
4334 "NEED_TRANS" : true ,
44- "num_stages" : 2 ,
35+ "num_stages" : 3 ,
4536 "num_warps" : 4
4637 },
4738 "256" : {
5344 "num_stages" : 2 ,
5445 "num_warps" : 4
5546 },
56- "32" : {
57- "BLOCK_SIZE_K" : 128 ,
58- "BLOCK_SIZE_M" : 16 ,
59- "BLOCK_SIZE_N" : 128 ,
60- "GROUP_SIZE_M" : 64 ,
61- "NEED_TRANS" : true ,
62- "num_stages" : 2 ,
63- "num_warps" : 4
64- },
6547 "32768" : {
6648 "BLOCK_SIZE_K" : 128 ,
6749 "BLOCK_SIZE_M" : 64 ,
8971 "num_stages" : 2 ,
9072 "num_warps" : 4
9173 },
74+ "67584" : {
75+ "BLOCK_SIZE_K" : 128 ,
76+ "BLOCK_SIZE_M" : 64 ,
77+ "BLOCK_SIZE_N" : 128 ,
78+ "GROUP_SIZE_M" : 16 ,
79+ "NEED_TRANS" : false ,
80+ "num_stages" : 3 ,
81+ "num_warps" : 4
82+ },
9283 "8" : {
9384 "BLOCK_SIZE_K" : 64 ,
9485 "BLOCK_SIZE_M" : 16 ,
9586 "BLOCK_SIZE_N" : 128 ,
96- "GROUP_SIZE_M" : 64 ,
87+ "GROUP_SIZE_M" : 32 ,
9788 "NEED_TRANS" : true ,
98- "num_stages" : 3 ,
89+ "num_stages" : 2 ,
9990 "num_warps" : 4
10091 },
10192 "800" : {
Original file line number Diff line number Diff line change 1717 "num_stages" : 2 ,
1818 "num_warps" : 4
1919 },
20- "147456" : {
21- "BLOCK_SIZE_K" : 128 ,
22- "BLOCK_SIZE_M" : 64 ,
23- "BLOCK_SIZE_N" : 128 ,
24- "GROUP_SIZE_M" : 16 ,
25- "NEED_TRANS" : false ,
26- "num_stages" : 3 ,
27- "num_warps" : 4
28- },
2920 "18432" : {
3021 "BLOCK_SIZE_K" : 128 ,
3122 "BLOCK_SIZE_M" : 64 ,
3223 "BLOCK_SIZE_N" : 128 ,
33- "GROUP_SIZE_M" : 32 ,
24+ "GROUP_SIZE_M" : 64 ,
3425 "NEED_TRANS" : false ,
3526 "num_stages" : 3 ,
3627 "num_warps" : 4
5344 "num_stages" : 2 ,
5445 "num_warps" : 4
5546 },
56- "36" : {
57- "BLOCK_SIZE_K" : 128 ,
58- "BLOCK_SIZE_M" : 16 ,
59- "BLOCK_SIZE_N" : 128 ,
60- "GROUP_SIZE_M" : 64 ,
61- "NEED_TRANS" : true ,
62- "num_stages" : 2 ,
63- "num_warps" : 4
64- },
6547 "36864" : {
6648 "BLOCK_SIZE_K" : 128 ,
6749 "BLOCK_SIZE_M" : 64 ,
8971 "num_stages" : 2 ,
9072 "num_warps" : 4
9173 },
74+ "76032" : {
75+ "BLOCK_SIZE_K" : 128 ,
76+ "BLOCK_SIZE_M" : 64 ,
77+ "BLOCK_SIZE_N" : 128 ,
78+ "GROUP_SIZE_M" : 16 ,
79+ "NEED_TRANS" : false ,
80+ "num_stages" : 3 ,
81+ "num_warps" : 4
82+ },
9283 "9" : {
9384 "BLOCK_SIZE_K" : 64 ,
9485 "BLOCK_SIZE_M" : 16 ,
9586 "BLOCK_SIZE_N" : 128 ,
9687 "GROUP_SIZE_M" : 64 ,
9788 "NEED_TRANS" : true ,
98- "num_stages" : 3 ,
89+ "num_stages" : 2 ,
9990 "num_warps" : 4
10091 },
10192 "900" : {
111102 "BLOCK_SIZE_K" : 128 ,
112103 "BLOCK_SIZE_M" : 64 ,
113104 "BLOCK_SIZE_N" : 128 ,
114- "GROUP_SIZE_M" : 64 ,
105+ "GROUP_SIZE_M" : 32 ,
115106 "NEED_TRANS" : false ,
116107 "num_stages" : 3 ,
117108 "num_warps" : 4
Original file line number Diff line number Diff line change 1212 "BLOCK_SIZE_K" : 128 ,
1313 "BLOCK_SIZE_M" : 16 ,
1414 "BLOCK_SIZE_N" : 64 ,
15- "GROUP_SIZE_M" : 16 ,
15+ "GROUP_SIZE_M" : 1 ,
1616 "NEED_TRANS" : true ,
17- "num_stages" : 4 ,
17+ "num_stages" : 5 ,
1818 "num_warps" : 4
1919 },
2020 "1024" : {
2121 "BLOCK_SIZE_K" : 128 ,
2222 "BLOCK_SIZE_M" : 64 ,
2323 "BLOCK_SIZE_N" : 128 ,
24- "GROUP_SIZE_M" : 16 ,
24+ "GROUP_SIZE_M" : 1 ,
2525 "NEED_TRANS" : false ,
2626 "num_stages" : 4 ,
2727 "num_warps" : 4
3030 "BLOCK_SIZE_K" : 128 ,
3131 "BLOCK_SIZE_M" : 16 ,
3232 "BLOCK_SIZE_N" : 128 ,
33- "GROUP_SIZE_M" : 64 ,
33+ "GROUP_SIZE_M" : 16 ,
3434 "NEED_TRANS" : true ,
3535 "num_stages" : 5 ,
3636 "num_warps" : 4
3737 },
3838 "16" : {
3939 "BLOCK_SIZE_K" : 128 ,
4040 "BLOCK_SIZE_M" : 16 ,
41- "BLOCK_SIZE_N" : 128 ,
41+ "BLOCK_SIZE_N" : 64 ,
4242 "GROUP_SIZE_M" : 32 ,
4343 "NEED_TRANS" : true ,
44- "num_stages" : 4 ,
45- "num_warps" : 4
46- },
47- "16384" : {
48- "BLOCK_SIZE_K" : 128 ,
49- "BLOCK_SIZE_M" : 64 ,
50- "BLOCK_SIZE_N" : 128 ,
51- "GROUP_SIZE_M" : 16 ,
52- "NEED_TRANS" : false ,
53- "num_stages" : 4 ,
44+ "num_stages" : 3 ,
5445 "num_warps" : 4
5546 },
5647 "2048" : {
5748 "BLOCK_SIZE_K" : 128 ,
5849 "BLOCK_SIZE_M" : 64 ,
5950 "BLOCK_SIZE_N" : 128 ,
60- "GROUP_SIZE_M" : 1 ,
51+ "GROUP_SIZE_M" : 16 ,
6152 "NEED_TRANS" : false ,
6253 "num_stages" : 4 ,
6354 "num_warps" : 4
6657 "BLOCK_SIZE_K" : 128 ,
6758 "BLOCK_SIZE_M" : 16 ,
6859 "BLOCK_SIZE_N" : 128 ,
69- "GROUP_SIZE_M" : 64 ,
60+ "GROUP_SIZE_M" : 16 ,
7061 "NEED_TRANS" : true ,
7162 "num_stages" : 3 ,
7263 "num_warps" : 4
7364 },
7465 "32" : {
7566 "BLOCK_SIZE_K" : 128 ,
76- "BLOCK_SIZE_M" : 16 ,
67+ "BLOCK_SIZE_M" : 32 ,
7768 "BLOCK_SIZE_N" : 64 ,
78- "GROUP_SIZE_M" : 32 ,
79- "NEED_TRANS" : true ,
80- "num_stages" : 4 ,
81- "num_warps" : 4
82- },
83- "4" : {
84- "BLOCK_SIZE_K" : 128 ,
85- "BLOCK_SIZE_M" : 16 ,
86- "BLOCK_SIZE_N" : 64 ,
87- "GROUP_SIZE_M" : 16 ,
69+ "GROUP_SIZE_M" : 1 ,
8870 "NEED_TRANS" : true ,
89- "num_stages" : 5 ,
71+ "num_stages" : 3 ,
9072 "num_warps" : 4
9173 },
9274 "4096" : {
10284 "BLOCK_SIZE_K" : 128 ,
10385 "BLOCK_SIZE_M" : 16 ,
10486 "BLOCK_SIZE_N" : 64 ,
105- "GROUP_SIZE_M" : 32 ,
87+ "GROUP_SIZE_M" : 64 ,
10688 "NEED_TRANS" : true ,
107- "num_stages" : 4 ,
89+ "num_stages" : 3 ,
10890 "num_warps" : 4
10991 },
11092 "8" : {
11395 "BLOCK_SIZE_N" : 64 ,
11496 "GROUP_SIZE_M" : 32 ,
11597 "NEED_TRANS" : true ,
98+ "num_stages" : 5 ,
99+ "num_warps" : 4
100+ },
101+ "8448" : {
102+ "BLOCK_SIZE_K" : 128 ,
103+ "BLOCK_SIZE_M" : 64 ,
104+ "BLOCK_SIZE_N" : 128 ,
105+ "GROUP_SIZE_M" : 16 ,
106+ "NEED_TRANS" : false ,
116107 "num_stages" : 4 ,
117108 "num_warps" : 4
118109 }
Original file line number Diff line number Diff line change 55 "BLOCK_SIZE_N" : 64 ,
66 "GROUP_SIZE_M" : 1 ,
77 "NEED_TRANS" : true ,
8- "num_stages" : 4 ,
8+ "num_stages" : 5 ,
99 "num_warps" : 4
1010 },
1111 "100" : {
1818 "num_warps" : 4
1919 },
2020 "1024" : {
21- "BLOCK_SIZE_K" : 128 ,
21+ "BLOCK_SIZE_K" : 64 ,
2222 "BLOCK_SIZE_M" : 64 ,
23- "BLOCK_SIZE_N" : 64 ,
24- "GROUP_SIZE_M" : 1 ,
23+ "BLOCK_SIZE_N" : 128 ,
24+ "GROUP_SIZE_M" : 16 ,
2525 "NEED_TRANS" : false ,
26- "num_stages" : 4 ,
26+ "num_stages" : 5 ,
2727 "num_warps" : 4
2828 },
2929 "128" : {
3030 "BLOCK_SIZE_K" : 128 ,
3131 "BLOCK_SIZE_M" : 16 ,
32- "BLOCK_SIZE_N" : 128 ,
33- "GROUP_SIZE_M" : 16 ,
32+ "BLOCK_SIZE_N" : 64 ,
33+ "GROUP_SIZE_M" : 1 ,
3434 "NEED_TRANS" : true ,
35- "num_stages" : 3 ,
35+ "num_stages" : 5 ,
3636 "num_warps" : 4
3737 },
3838 "16" : {
3939 "BLOCK_SIZE_K" : 128 ,
40- "BLOCK_SIZE_M" : 32 ,
41- "BLOCK_SIZE_N" : 128 ,
42- "GROUP_SIZE_M" : 16 ,
40+ "BLOCK_SIZE_M" : 16 ,
41+ "BLOCK_SIZE_N" : 64 ,
42+ "GROUP_SIZE_M" : 1 ,
4343 "NEED_TRANS" : true ,
44- "num_stages" : 3 ,
45- "num_warps" : 4
46- },
47- "16384" : {
48- "BLOCK_SIZE_K" : 128 ,
49- "BLOCK_SIZE_M" : 64 ,
50- "BLOCK_SIZE_N" : 128 ,
51- "GROUP_SIZE_M" : 16 ,
52- "NEED_TRANS" : false ,
53- "num_stages" : 4 ,
44+ "num_stages" : 5 ,
5445 "num_warps" : 4
5546 },
5647 "2048" : {
6455 },
6556 "256" : {
6657 "BLOCK_SIZE_K" : 128 ,
67- "BLOCK_SIZE_M" : 32 ,
58+ "BLOCK_SIZE_M" : 16 ,
6859 "BLOCK_SIZE_N" : 128 ,
69- "GROUP_SIZE_M" : 16 ,
60+ "GROUP_SIZE_M" : 1 ,
7061 "NEED_TRANS" : true ,
7162 "num_stages" : 4 ,
7263 "num_warps" : 4
7364 },
7465 "32" : {
7566 "BLOCK_SIZE_K" : 128 ,
76- "BLOCK_SIZE_M" : 16 ,
67+ "BLOCK_SIZE_M" : 32 ,
7768 "BLOCK_SIZE_N" : 64 ,
7869 "GROUP_SIZE_M" : 16 ,
7970 "NEED_TRANS" : true ,
80- "num_stages" : 4 ,
81- "num_warps" : 4
82- },
83- "4" : {
84- "BLOCK_SIZE_K" : 128 ,
85- "BLOCK_SIZE_M" : 16 ,
86- "BLOCK_SIZE_N" : 64 ,
87- "GROUP_SIZE_M" : 32 ,
88- "NEED_TRANS" : true ,
89- "num_stages" : 5 ,
71+ "num_stages" : 3 ,
9072 "num_warps" : 4
9173 },
9274 "4096" : {
10284 "BLOCK_SIZE_K" : 128 ,
10385 "BLOCK_SIZE_M" : 16 ,
10486 "BLOCK_SIZE_N" : 64 ,
105- "GROUP_SIZE_M" : 1 ,
87+ "GROUP_SIZE_M" : 16 ,
10688 "NEED_TRANS" : true ,
10789 "num_stages" : 3 ,
10890 "num_warps" : 4
10991 },
11092 "8" : {
11193 "BLOCK_SIZE_K" : 128 ,
112- "BLOCK_SIZE_M" : 32 ,
94+ "BLOCK_SIZE_M" : 16 ,
11395 "BLOCK_SIZE_N" : 64 ,
114- "GROUP_SIZE_M" : 1 ,
96+ "GROUP_SIZE_M" : 16 ,
11597 "NEED_TRANS" : true ,
98+ "num_stages" : 3 ,
99+ "num_warps" : 4
100+ },
101+ "8448" : {
102+ "BLOCK_SIZE_K" : 128 ,
103+ "BLOCK_SIZE_M" : 64 ,
104+ "BLOCK_SIZE_N" : 128 ,
105+ "GROUP_SIZE_M" : 16 ,
106+ "NEED_TRANS" : false ,
116107 "num_stages" : 4 ,
117108 "num_warps" : 4
118109 }
You can’t perform that action at this time.
0 commit comments