Commit 34696f9

Limit the threads in max_parallelism
1 parent a1aa557 commit 34696f9

5 files changed, +43 -43 lines

apps/bgu/CMakeLists.txt
Lines changed: 12 additions & 1 deletion
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 
+set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
+    # When target=host-cuda or host-metal, setting last_level_cache per GPU block
+    # eliminates all `.compute_at` (and hence all GPU shared memory allocations) in
+    # the generated schedules, so the cache limit is only appended for non-GPU targets.
+    list(APPEND _bgu_autoscheduler_params
+        autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_bgu_autoscheduler_params})
 
 # Main executable
 add_executable(bgu_filter filter.cpp)

apps/lens_blur/CMakeLists.txt
Lines changed: 8 additions & 23 deletions
@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)

@@ -32,26 +32,11 @@ target_link_libraries(lens_blur_filter
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb_small.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "metal")
-        # Note(antonysigma): Buildbot error message:
-        #
-        # 2025-06-30 23:26:02.260 lens_blur_filter[32272:21031150] Metal API Validation
-        # Enabled -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
-        # failed assertion `(threadsPerThreadgroup.width(32) *
-        # threadsPerThreadgroup.height(32) * threadsPerThreadgroup.depth(1))(1024) must
-        # be <= 896. (kernel threadgroup size limit)'
-        #
-        # Possible root cause: Autoscheduler's GPUTilingDedup::max_n_threads is
-        # hardcoded to 1024 threads per block. The OSX Metal API caps the value at 836
-        # threads per block because of the register pressure in lens_blur's GPU kernel.
-        message("Pipeline lens_blur_auto_schedule skipped for target host-metal")
-    else ()
-        configure_file(${IMAGE} rgb_small.png COPYONLY)
-        add_test(NAME lens_blur_filter
-                 COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
-        set_tests_properties(lens_blur_filter PROPERTIES
-                             LABELS lens_blur
-                             PASS_REGULAR_EXPRESSION "Success!"
-                             SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb_small.png COPYONLY)
+    add_test(NAME lens_blur_filter
+             COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
+    set_tests_properties(lens_blur_filter PROPERTIES
+                         LABELS lens_blur
+                         PASS_REGULAR_EXPRESSION "Success!"
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()

apps/local_laplacian/CMakeLists.txt
Lines changed: 1 addition & 1 deletion
@@ -23,7 +23,7 @@ add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.experimental_gpu_schedule=1
 )
 
 # Main executable

apps/stencil_chain/CMakeLists.txt
Lines changed: 12 additions & 3 deletions
@@ -14,14 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp)
 
+set(_stencil_chain_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
+    # When target=host-cuda or host-metal, setting last_level_cache per GPU block
+    # eliminates all `.compute_at` (and hence all GPU shared memory allocations) in
+    # the generated schedules, so the cache limit is only appended for non-GPU targets.
+    list(APPEND _stencil_chain_autoscheduler_params
+        autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # When target=host-cuda or host-metal, limit the GPU shared
-                   # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=2000 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS ${_stencil_chain_autoscheduler_params}
 )
 
 # Main executable

src/autoschedulers/mullapudi2016/AutoSchedule.cpp
Lines changed: 10 additions & 15 deletions
@@ -1368,7 +1368,7 @@ class GPUTilingDedup {
     }
 
     /** Generate Halide GPU schedules. */
-    void apply(AutoSchedule &sched) {
+    void apply(AutoSchedule &sched, const Expr &parallelism) {
        if (!ordering.empty() && !is_initial_order) {
            std::set<std::string> var_list;
            for (const auto &v : ordering) {

@@ -1396,7 +1396,7 @@
        }

        GPUTileHelper helper{f, stage_num};
-       Expr threads_budget = max_n_threads;
+       Expr threads_budget = min(parallelism, max_n_threads);

        // Maximize GPU thread occupancy with the grid-stride loop.
        //

@@ -1423,22 +1423,16 @@

            const auto &[var, entry] = *iter;

-           const bool should_unroll = can_prove(entry.factor <= 1);
-           if (should_unroll) {
-               // Skip thread size of 1.
-               continue;
-           }
-
            split_info new_entry{entry};
-           new_entry.factor = 1;
+           new_entry.factor = simplify(min(threads_budget, entry.factor));

            const bool can_split = helper.try_split(new_entry);
            if (!can_split) {
                // If more than 3 gpu_blocks are defined, mark the current loop as the for-loop.
                parallelize.erase(iter);
                continue;
            }
-           threads_budget = simplify(max(threads_budget / entry.factor, 1));
+           threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
        }

        helper.commit(sched, is_compute_at);

@@ -2210,7 +2204,7 @@ Partitioner::find_best_tile_config(const Group &g) {
    Group no_tile = g;
    no_tile.tile_sizes = no_tile_config;

-   bool show_analysis = false;
+   constexpr bool show_analysis = false;
    GroupAnalysis no_tile_analysis = analyze_group(no_tile, show_analysis);

    GroupAnalysis best_analysis = no_tile_analysis;

@@ -2233,7 +2227,7 @@ Partitioner::find_best_tile_config(const Group &g) {
        Expr benefit = estimate_benefit(best_analysis, new_analysis,
                                        no_redundant_work, true);

-       if (show_analysis) {
+       if constexpr (show_analysis) {
            debug(0) << "Benefit relative to not tiling:" << benefit << "\n";
            debug(0) << "Best analysis:" << new_analysis;
            debug(0) << "No tile analysis:" << no_tile_analysis;

@@ -3439,7 +3433,8 @@ void Partitioner::generate_group_cpu_schedule(
            }
        }
        if (arch_params.is_gpu_schedule) {
-           auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
+           const Expr gpu_threads = simplify(min(iter->second, arch_params.parallelism / def_par));
+           auto parallelized_split = gpu_tiling.can_parallelize(v, gpu_threads);
            if (parallelized_split) {
                auto split_vars = *parallelized_split;
                inner_dims.emplace_back(split_vars.inner);

@@ -3463,7 +3458,7 @@
        }

        if (arch_params.is_gpu_schedule) {
-           gpu_tiling.apply(sched);
+           gpu_tiling.apply(sched, arch_params.parallelism);
        }

        // Find the level at which group members will be computed.

@@ -3552,7 +3547,7 @@
                                 mem_rvars, mem_estimates, sched, gpu_tiling2);

            if (arch_params.is_gpu_schedule) {
-               gpu_tiling2.apply(sched);
+               gpu_tiling2.apply(sched, arch_params.parallelism);
            }
        }
    }
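
To make the scheduling change concrete, here is a small standalone sketch of the new thread-budget arithmetic in GPUTilingDedup::apply, written with plain integers instead of Halide Exprs. The function name distribute_thread_budget and the sample factors are illustrative assumptions, not part of the autoscheduler's API.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the budget logic above: the budget starts at min(parallelism, max_n_threads),
// each requested gpu_threads split factor is clamped to the remaining budget, and the
// budget shrinks by the factor that was actually assigned.
std::vector<int64_t> distribute_thread_budget(int64_t parallelism,
                                              int64_t max_n_threads,
                                              const std::vector<int64_t> &requested) {
    int64_t budget = std::min(parallelism, max_n_threads);
    std::vector<int64_t> assigned;
    for (int64_t factor : requested) {
        int64_t f = std::min(budget, factor);       // new_entry.factor = min(threads_budget, entry.factor)
        assigned.push_back(f);
        budget = std::max(budget / f, int64_t{1});  // threads_budget = max(threads_budget / new_entry.factor, 1)
    }
    return assigned;
}

int main() {
    // With a parallelism estimate of 256 and candidate thread-loop extents of 32, 32,
    // and 4, the assigned factors are 32, 8, 1: their product (256) stays within the
    // parallelism cap instead of growing toward the hardcoded max_n_threads of 1024.
    for (int64_t f : distribute_thread_budget(256, 1024, {32, 32, 4})) {
        std::cout << f << "\n";
    }
    return 0;
}

Presumably this bound is also what allows the Metal-specific skip in apps/lens_blur to be dropped: the per-block thread count is now limited by the parallelism estimate rather than only by the 1024-thread ceiling that triggered the Metal threadgroup-size assertion.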
