Skip to content

Commit 7a4518b

Browse files
committed
[TRTLLM-8511][feat] Dynamically select a tile size for fused_mlp_moe_kernel
For the Triton fused_moe_kernel, search for a device-specific (skew) tile size configuration using the batch size as the key. Each device has its own configuration file in JSON format. If the config file is not found, we revert to the default tile size configuration. Signed-off-by: Neta Zmora <[email protected]>
1 parent 3a5845e commit 7a4518b

File tree

203 files changed

+30149
-3
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

203 files changed

+30149
-3
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 16,
4+
"BLOCK_SIZE_N": 32,
5+
"BLOCK_SIZE_K": 128,
6+
"GROUP_SIZE_M": 1,
7+
"num_warps": 4,
8+
"num_stages": 4
9+
},
10+
"2": {
11+
"BLOCK_SIZE_M": 16,
12+
"BLOCK_SIZE_N": 32,
13+
"BLOCK_SIZE_K": 128,
14+
"GROUP_SIZE_M": 64,
15+
"num_warps": 4,
16+
"num_stages": 3
17+
},
18+
"4": {
19+
"BLOCK_SIZE_M": 16,
20+
"BLOCK_SIZE_N": 32,
21+
"BLOCK_SIZE_K": 128,
22+
"GROUP_SIZE_M": 1,
23+
"num_warps": 4,
24+
"num_stages": 4
25+
},
26+
"8": {
27+
"BLOCK_SIZE_M": 16,
28+
"BLOCK_SIZE_N": 32,
29+
"BLOCK_SIZE_K": 256,
30+
"GROUP_SIZE_M": 32,
31+
"num_warps": 4,
32+
"num_stages": 3
33+
},
34+
"16": {
35+
"BLOCK_SIZE_M": 16,
36+
"BLOCK_SIZE_N": 32,
37+
"BLOCK_SIZE_K": 128,
38+
"GROUP_SIZE_M": 1,
39+
"num_warps": 4,
40+
"num_stages": 4
41+
},
42+
"24": {
43+
"BLOCK_SIZE_M": 16,
44+
"BLOCK_SIZE_N": 64,
45+
"BLOCK_SIZE_K": 64,
46+
"GROUP_SIZE_M": 16,
47+
"num_warps": 4,
48+
"num_stages": 5
49+
},
50+
"32": {
51+
"BLOCK_SIZE_M": 16,
52+
"BLOCK_SIZE_N": 32,
53+
"BLOCK_SIZE_K": 256,
54+
"GROUP_SIZE_M": 1,
55+
"num_warps": 4,
56+
"num_stages": 2
57+
},
58+
"48": {
59+
"BLOCK_SIZE_M": 64,
60+
"BLOCK_SIZE_N": 64,
61+
"BLOCK_SIZE_K": 64,
62+
"GROUP_SIZE_M": 1,
63+
"num_warps": 4,
64+
"num_stages": 3
65+
},
66+
"64": {
67+
"BLOCK_SIZE_M": 64,
68+
"BLOCK_SIZE_N": 64,
69+
"BLOCK_SIZE_K": 64,
70+
"GROUP_SIZE_M": 1,
71+
"num_warps": 4,
72+
"num_stages": 3
73+
},
74+
"96": {
75+
"BLOCK_SIZE_M": 32,
76+
"BLOCK_SIZE_N": 128,
77+
"BLOCK_SIZE_K": 128,
78+
"GROUP_SIZE_M": 1,
79+
"num_warps": 4,
80+
"num_stages": 3
81+
},
82+
"128": {
83+
"BLOCK_SIZE_M": 64,
84+
"BLOCK_SIZE_N": 64,
85+
"BLOCK_SIZE_K": 64,
86+
"GROUP_SIZE_M": 16,
87+
"num_warps": 4,
88+
"num_stages": 3
89+
},
90+
"256": {
91+
"BLOCK_SIZE_M": 64,
92+
"BLOCK_SIZE_N": 64,
93+
"BLOCK_SIZE_K": 64,
94+
"GROUP_SIZE_M": 32,
95+
"num_warps": 4,
96+
"num_stages": 4
97+
},
98+
"512": {
99+
"BLOCK_SIZE_M": 64,
100+
"BLOCK_SIZE_N": 256,
101+
"BLOCK_SIZE_K": 64,
102+
"GROUP_SIZE_M": 32,
103+
"num_warps": 4,
104+
"num_stages": 4
105+
},
106+
"1024": {
107+
"BLOCK_SIZE_M": 64,
108+
"BLOCK_SIZE_N": 256,
109+
"BLOCK_SIZE_K": 64,
110+
"GROUP_SIZE_M": 64,
111+
"num_warps": 4,
112+
"num_stages": 4
113+
},
114+
"1536": {
115+
"BLOCK_SIZE_M": 64,
116+
"BLOCK_SIZE_N": 256,
117+
"BLOCK_SIZE_K": 64,
118+
"GROUP_SIZE_M": 64,
119+
"num_warps": 4,
120+
"num_stages": 4
121+
},
122+
"2048": {
123+
"BLOCK_SIZE_M": 64,
124+
"BLOCK_SIZE_N": 256,
125+
"BLOCK_SIZE_K": 64,
126+
"GROUP_SIZE_M": 32,
127+
"num_warps": 4,
128+
"num_stages": 4
129+
},
130+
"3072": {
131+
"BLOCK_SIZE_M": 64,
132+
"BLOCK_SIZE_N": 256,
133+
"BLOCK_SIZE_K": 64,
134+
"GROUP_SIZE_M": 32,
135+
"num_warps": 4,
136+
"num_stages": 4
137+
},
138+
"4096": {
139+
"BLOCK_SIZE_M": 64,
140+
"BLOCK_SIZE_N": 256,
141+
"BLOCK_SIZE_K": 64,
142+
"GROUP_SIZE_M": 16,
143+
"num_warps": 4,
144+
"num_stages": 4
145+
}
146+
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,146 @@
1+
{
2+
"1": {
3+
"BLOCK_SIZE_M": 16,
4+
"BLOCK_SIZE_N": 64,
5+
"BLOCK_SIZE_K": 256,
6+
"GROUP_SIZE_M": 16,
7+
"num_warps": 4,
8+
"num_stages": 4
9+
},
10+
"2": {
11+
"BLOCK_SIZE_M": 16,
12+
"BLOCK_SIZE_N": 64,
13+
"BLOCK_SIZE_K": 256,
14+
"GROUP_SIZE_M": 32,
15+
"num_warps": 4,
16+
"num_stages": 4
17+
},
18+
"4": {
19+
"BLOCK_SIZE_M": 16,
20+
"BLOCK_SIZE_N": 64,
21+
"BLOCK_SIZE_K": 256,
22+
"GROUP_SIZE_M": 16,
23+
"num_warps": 4,
24+
"num_stages": 4
25+
},
26+
"8": {
27+
"BLOCK_SIZE_M": 16,
28+
"BLOCK_SIZE_N": 64,
29+
"BLOCK_SIZE_K": 256,
30+
"GROUP_SIZE_M": 32,
31+
"num_warps": 4,
32+
"num_stages": 4
33+
},
34+
"16": {
35+
"BLOCK_SIZE_M": 16,
36+
"BLOCK_SIZE_N": 64,
37+
"BLOCK_SIZE_K": 256,
38+
"GROUP_SIZE_M": 64,
39+
"num_warps": 4,
40+
"num_stages": 4
41+
},
42+
"24": {
43+
"BLOCK_SIZE_M": 32,
44+
"BLOCK_SIZE_N": 64,
45+
"BLOCK_SIZE_K": 128,
46+
"GROUP_SIZE_M": 16,
47+
"num_warps": 4,
48+
"num_stages": 5
49+
},
50+
"32": {
51+
"BLOCK_SIZE_M": 32,
52+
"BLOCK_SIZE_N": 64,
53+
"BLOCK_SIZE_K": 128,
54+
"GROUP_SIZE_M": 1,
55+
"num_warps": 4,
56+
"num_stages": 5
57+
},
58+
"48": {
59+
"BLOCK_SIZE_M": 32,
60+
"BLOCK_SIZE_N": 64,
61+
"BLOCK_SIZE_K": 64,
62+
"GROUP_SIZE_M": 16,
63+
"num_warps": 4,
64+
"num_stages": 5
65+
},
66+
"64": {
67+
"BLOCK_SIZE_M": 64,
68+
"BLOCK_SIZE_N": 64,
69+
"BLOCK_SIZE_K": 128,
70+
"GROUP_SIZE_M": 1,
71+
"num_warps": 4,
72+
"num_stages": 4
73+
},
74+
"96": {
75+
"BLOCK_SIZE_M": 32,
76+
"BLOCK_SIZE_N": 256,
77+
"BLOCK_SIZE_K": 64,
78+
"GROUP_SIZE_M": 16,
79+
"num_warps": 4,
80+
"num_stages": 5
81+
},
82+
"128": {
83+
"BLOCK_SIZE_M": 64,
84+
"BLOCK_SIZE_N": 128,
85+
"BLOCK_SIZE_K": 64,
86+
"GROUP_SIZE_M": 64,
87+
"num_warps": 4,
88+
"num_stages": 4
89+
},
90+
"256": {
91+
"BLOCK_SIZE_M": 128,
92+
"BLOCK_SIZE_N": 128,
93+
"BLOCK_SIZE_K": 64,
94+
"GROUP_SIZE_M": 16,
95+
"num_warps": 8,
96+
"num_stages": 4
97+
},
98+
"512": {
99+
"BLOCK_SIZE_M": 128,
100+
"BLOCK_SIZE_N": 128,
101+
"BLOCK_SIZE_K": 64,
102+
"GROUP_SIZE_M": 64,
103+
"num_warps": 4,
104+
"num_stages": 3
105+
},
106+
"1024": {
107+
"BLOCK_SIZE_M": 128,
108+
"BLOCK_SIZE_N": 128,
109+
"BLOCK_SIZE_K": 64,
110+
"GROUP_SIZE_M": 64,
111+
"num_warps": 4,
112+
"num_stages": 3
113+
},
114+
"1536": {
115+
"BLOCK_SIZE_M": 128,
116+
"BLOCK_SIZE_N": 128,
117+
"BLOCK_SIZE_K": 64,
118+
"GROUP_SIZE_M": 16,
119+
"num_warps": 4,
120+
"num_stages": 3
121+
},
122+
"2048": {
123+
"BLOCK_SIZE_M": 128,
124+
"BLOCK_SIZE_N": 128,
125+
"BLOCK_SIZE_K": 64,
126+
"GROUP_SIZE_M": 32,
127+
"num_warps": 4,
128+
"num_stages": 3
129+
},
130+
"3072": {
131+
"BLOCK_SIZE_M": 128,
132+
"BLOCK_SIZE_N": 128,
133+
"BLOCK_SIZE_K": 64,
134+
"GROUP_SIZE_M": 16,
135+
"num_warps": 4,
136+
"num_stages": 3
137+
},
138+
"4096": {
139+
"BLOCK_SIZE_M": 128,
140+
"BLOCK_SIZE_N": 128,
141+
"BLOCK_SIZE_K": 64,
142+
"GROUP_SIZE_M": 16,
143+
"num_warps": 4,
144+
"num_stages": 3
145+
}
146+
}

0 commit comments

Comments
 (0)