Commit e46ac2c

Replace or emplace outer dimensions for GPU schedules
In the nested-parallelism scheduling algorithm, whenever a dimension is split for GPU acceleration, e.g. `y -> y_i, y_o`, replace the corresponding variable `y` with `y_o` in `outer_dims`. This ensures the internal assertion `dims.size() >= outer_dims.size()` always holds for GPU schedules. The immediate effect is that for a downstream stage with a GPU schedule such as `g.gpu_tile(x, xi, xo, ...)`, an upstream stage now refers to the correct dimension `xo` in `f.compute_at(g, xo)`, in accordance with the original design intent of the Mullapudi2016 paper. As a result, the GPU IR correctly synthesizes shared GPU memory to cache the intermediate results of stage `f`.

---

Also, for all stages that are scheduled with `compute_at`, mark all vectorizable inner dimensions as `gpu_threads`.

---

In the correctness tests at `test/autoschedulers/mullapudi2016/*.cpp` and the performance regression tests at `apps/*`, lower the estimated GPU shared-memory limit by specifying `autoscheduler.last_level_cache_size <= 10000`. Except for the `conv_layer` pipeline, all pipelines should observe an improvement in caching.
1 parent 990eb5f · commit e46ac2c
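To make the schedule shape concrete: below is a minimal hand-written sketch of the pattern the commit message describes, not the autoscheduler's actual output. The stage and variable names (f, g, xo, xi, ...) and the tile size 16 are illustrative assumptions. The consumer is GPU-tiled, and the producer is computed at the consumer's block variable, so the producer's tile is staged per block (in GPU shared memory) with its loops mapped to GPU threads.

#include "Halide.h"
using namespace Halide;

int main() {
    Var x("x"), y("y");
    Func f("f"), g("g");

    // Producer/consumer pair: g reads a small stencil of f.
    f(x, y) = cast<float>(x + y);
    g(x, y) = f(x, y) + f(x + 1, y) + f(x, y + 1);

    Var xo("xo"), yo("yo"), xi("xi"), yi("yi");
    // Consumer: map tiles to GPU blocks (xo, yo) and GPU threads (xi, yi).
    g.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);

    // Producer: compute at the consumer's *block* variable xo, so the tile of f
    // needed by one block is staged in shared memory, and run its own loops as
    // gpu threads -- the shape this commit makes the autoscheduler emit.
    f.compute_at(g, xo)
        .gpu_threads(x, y);

    // Print the loop nest only; this does not require a GPU at build time.
    g.print_loop_nest();
    return 0;
}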

File tree

10 files changed: +49 -44 lines

apps/bgu/CMakeLists.txt

Lines changed: 1 addition & 10 deletions
@@ -19,16 +19,7 @@ add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # Note(antonysigma): experimental GPU schedule failed on the Buildbot worker
-                   # "halide-testbranch-main-llvm18-x86-64-linux-cmake" with error:
-                   #
-                   # CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS cuCtxSynchronize failed
-                   #
-                   # Curiously, it works on a low-end GPU: Nvidia GTX 1660S.
-                   #
-                   # Uncomment the following code to debug. PARAMS
-                   # autoscheduler.experimental_gpu_schedule=1
-                   )
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(bgu_filter filter.cpp)
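The PARAMS entries above are forwarded to the autoscheduler as plain key/value strings, so the same configuration can be tried without CMake. Below is a small sketch of the equivalent JIT-mode call; the stand-in pipeline, its estimates, and the plugin library name are illustrative assumptions, and only the two parameter keys and values come from the diff above.

#include "Halide.h"
using namespace Halide;

int main() {
    // Stand-in pipeline; the real apps build theirs inside generators.
    Var x("x"), y("y");
    Func in = lambda(x, y, cast<float>(x + y));
    Func blur("blur");
    blur(x, y) = (in(x, y) + in(x + 1, y) + in(x, y + 1)) / 3.0f;
    blur.set_estimates({{0, 1536}, {0, 2560}});

    // The Mullapudi2016 autoscheduler ships as a plugin; this is the usual
    // library name, but it depends on how Halide was built and installed.
    load_plugin("autoschedule_mullapudi2016");

    // Same knobs as the CMake PARAMS above, passed as key/value strings. With a
    // GPU target (e.g. HL_JIT_TARGET=host-cuda) the experimental GPU path applies.
    Target target = get_jit_target_from_environment();
    AutoschedulerParams params("Mullapudi2016",
                               {{"last_level_cache_size", "1000"},
                                {"experimental_gpu_schedule", "1"}});
    Pipeline(blur).apply_autoscheduler(target, params);
    return 0;
}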

apps/camera_pipe/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ add_halide_generator(camera_pipe.generator
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(camera_pipe_process process.cpp)

apps/harris/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(harris_filter filter.cpp)

apps/iir_blur/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ add_halide_library(iir_blur FROM iir_blur.generator)
 add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
                    GENERATOR iir_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(iir_blur_filter filter.cpp)

apps/lens_blur/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(lens_blur_filter process.cpp)

apps/stencil_chain/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
@@ -18,7 +18,11 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block to avoid gpu kernel launch failure.
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1
+                   )

 # Main executable
 add_executable(stencil_chain_process process.cpp)
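A rough sense of scale for these budgets (assuming, as on the CPU path, that last_level_cache_size is a byte budget for the footprint staged per block): with 4-byte float intermediates, a budget of 1000 bytes holds about 250 values, on the order of a 16x16 single-channel tile, while 20000 bytes holds about 5000 values, on the order of a 70x70 tile. The smaller budgets keep the per-block shared-memory allocation well under typical 48 KB hardware limits even when several producers are staged inside the same block.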

apps/unsharp/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -18,7 +18,8 @@ add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)
 add_halide_library(unsharp FROM unsharp.generator)
 add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                    GENERATOR unsharp
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(unsharp_filter filter.cpp)

src/autoschedulers/mullapudi2016/AutoSchedule.cpp

Lines changed: 22 additions & 3 deletions
@@ -123,6 +123,21 @@ string get_sanitized_name(string name) {
     return name;
 }

+// Similar to std::replace, but assuming the vector contains unique values. And
+// if the element is absent, append new value to the end of vector.
+void replace_or_emplace(std::vector<VarOrRVar> &dims_, const VarOrRVar &before, VarOrRVar after) {
+    auto iter = std::find_if(dims_.begin(), dims_.end(),
+                             [before_name = before.name()](const VarOrRVar &d) {
+                                 return d.name() == before_name;
+                             });
+    const bool is_found = (iter != dims_.end());
+    if (is_found) {
+        *iter = std::move(after);
+    } else {
+        dims_.emplace_back(std::move(after));
+    }
+}
+
 // Representation of a function stage in the pipeline.
 struct FStage {
     Function func;
@@ -1426,8 +1441,11 @@ class GPUTilingDedup {
             threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
         }

-        if (!is_already_split) {
-            helper.commit(sched, is_compute_at);
+        helper.commit(sched, is_compute_at);
+        if (is_compute_at) {
+            // There are dimensions that does not need splitting but marked as
+            // vectorizable. Mark them as gpu threads.
+            mark_gpu_threads(sched);
         }

         // After calling `gpu_tiles` from `GPUTileHelper::commit()`, a few of
@@ -3425,7 +3443,8 @@ void Partitioner::generate_group_cpu_schedule(
         if (parallelized_split) {
             auto split_vars = *parallelized_split;
             inner_dims.emplace_back(split_vars.inner);
-            outer_dims.emplace_back(split_vars.outer);
+
+            replace_or_emplace(outer_dims, v, split_vars.outer);
         }
     } else {
         f_handle.parallel(v);
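Since replace_or_emplace() is the core of the change, here is a self-contained sketch of its contract using std::string in place of Halide's VarOrRVar (the dimension names are made up). The point is that when an outer dimension is merely renamed by a split, outer_dims does not grow, which is what keeps the dims.size() >= outer_dims.size() assertion satisfied.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Analogue of the new helper: replace the first (and assumed only) match in
// place, otherwise append the new value.
static void replace_or_emplace(std::vector<std::string> &dims, const std::string &before,
                               std::string after) {
    auto iter = std::find(dims.begin(), dims.end(), before);
    if (iter != dims.end()) {
        *iter = std::move(after);  // e.g. outer dim "y" becomes "y_o" after a split
    } else {
        dims.emplace_back(std::move(after));  // dim not tracked yet: append it
    }
}

int main() {
    std::vector<std::string> outer_dims = {"x", "y"};
    replace_or_emplace(outer_dims, "y", "y_o");  // "y" was split, so it is replaced
    replace_or_emplace(outer_dims, "c", "c_o");  // "c" was absent, so "c_o" is appended
    assert((outer_dims == std::vector<std::string>{"x", "y_o", "c_o"}));
    return 0;
}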

test/autoschedulers/mullapudi2016/large_window.cpp

Lines changed: 7 additions & 1 deletion
@@ -47,7 +47,13 @@ int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
     Pipeline p(g);

-    p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+    constexpr Mullapudi2016TestParams gpu_specifications{
+        /* .last_level_cache_size = */ 35'000,
+        /* .parallelism = */ 128,
+    };
+
+    p.apply_autoscheduler(target,
+                          get_mullapudi2016_test_params(target.has_gpu_feature(), {gpu_specifications}));

     // Inspect the schedule (only for debugging))
     // g.print_loop_nest();

test/autoschedulers/mullapudi2016/reorder.cpp

Lines changed: 7 additions & 24 deletions
@@ -82,21 +82,11 @@ double run_test_2(bool auto_schedule) {
     // Provide estimates on the pipeline output
     diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}});

-    // Auto-schedule the pipeline
-    //
-    // Increasing the GPU's active warp count estimate (aka parallelism)
-    // from 128 to 2048 to disable the Autoscheduler's grid-stride loop
-    // feature. At small parallelism value, the autoscheduler correctly
-    // designates dimension 'z' as the stride axis in the GPU grid-stride
-    // loop, which improves thread occupancy. However, it fails to reorder
-    // 'z' inside the gpu_blocks 'xo' and 'yo', which is required for proper
-    // loop nesting and successful code generation.
-    //
-    // Reference:
-    // https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+    // note(antonysigma): Reducing the GPU's shared memory size estimate so that the GPU kernel
+    // can launch on consumer-grade GPUs.
     constexpr Mullapudi2016TestParams gpu_specifications{
-        /* .last_level_cache_size = */ 47'000,
-        /* .parallelism = */ 2048,
+        /* .last_level_cache_size = */ 20'000,
+        /* .parallelism = */ 128,
     };

     p.apply_autoscheduler(
@@ -139,16 +129,9 @@ double run_test_3(bool auto_schedule) {
     if (auto_schedule) {
         // Provide estimates on the pipeline output
         r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}});
-        // Auto-schedule the pipeline
-        //
-        // Disabling this experimental GPU feature because the autoscheduler correctly
-        // identifies reduction domain 'r.x' as the stride axis for the GPU grid-stride loop,
-        // which helps retain threads efficiently. However, it fails to reorder 'r.x'
-        // inside the loop nests of gpu_blocks 'xo' and 'yo', which is necessary for
-        // successful code generation.
-        //
-        // Reference: https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
-        p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+
+        p.apply_autoscheduler(target,
+                              get_mullapudi2016_test_params(target.has_gpu_feature()));
     } else {
         Var par("par");
         r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4);