From e46ac2c9e64fa83f0e8318c5e82a471794b1834e Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Thu, 10 Jul 2025 11:59:56 -0700
Subject: [PATCH 01/12] Replace or emplace outer dimensions for GPU schedules

In the nested parallelism scheduling algorithm, whenever the dimension
is marked for GPU acceleration, e.g. `y -> y_i, y_o`, replace the
corresponding variable `y` with `y_o` in `outer_dims`.

This ensures the internal assertion `dims.size() >= outer_dims.size()`
is always true for GPU schedules.

The immediate effect is that for a downstream stage having GPU
schedules: `g.gpu_tile(x, xi, xo, ...)`, the upstream stage correctly
specifies the dimension `xo` by `f.compute_at(g, xo)`. This is in
accordance to the original design intent of the Mullapudi2016 paper.

As a result, the GPU IR correctly synthesizes shared GPU memory to cache
the intermediate results of stage `f`, optimizing for caching.

---

Also, for all stages at are `computed_at`, mark all vectorizable inner
dimensions as `gpu_threads`.

---

In the correctness tests at `test/autoscheduler/mullapudi/*.cpp` and
performance regression tests at `apps/*`, down adjust the estimated GPU
shared memory limit by specifying `autoscheduler.last_level_cache_size
<= 10000`. Except for pipline `conv_layer`, all pipelines should observe
an improvement of caching.
---
 apps/bgu/CMakeLists.txt                       | 11 +------
 apps/camera_pipe/CMakeLists.txt               |  3 +-
 apps/harris/CMakeLists.txt                    |  2 +-
 apps/iir_blur/CMakeLists.txt                  |  2 +-
 apps/lens_blur/CMakeLists.txt                 |  2 +-
 apps/stencil_chain/CMakeLists.txt             |  6 +++-
 apps/unsharp/CMakeLists.txt                   |  3 +-
 .../mullapudi2016/AutoSchedule.cpp            | 25 +++++++++++++--
 .../mullapudi2016/large_window.cpp            |  8 ++++-
 test/autoschedulers/mullapudi2016/reorder.cpp | 31 +++++--------------
 10 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index d7a44a2cda9b..e8314fed4238 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -19,16 +19,7 @@ add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-# Note(antonysigma): experimental GPU schedule failed on the Buildbot worker
-# "halide-testbranch-main-llvm18-x86-64-linux-cmake" with error:
-#
-# CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS cuCtxSynchronize failed
-#
-# Curiously, it works on a low-end GPU: Nvidia GTX 1660S.
-#
-# Uncomment the following code to debug. PARAMS
-# autoscheduler.experimental_gpu_schedule=1
-)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(bgu_filter filter.cpp)
diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
index 94bf7a1ae447..cd5a207e3086 100644
--- a/apps/camera_pipe/CMakeLists.txt
+++ b/apps/camera_pipe/CMakeLists.txt
@@ -20,7 +20,8 @@ add_halide_generator(camera_pipe.generator
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(camera_pipe_process process.cpp)
diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
index b6f95e554383..5657b4ff471d 100644
--- a/apps/harris/CMakeLists.txt
+++ b/apps/harris/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(harris_filter filter.cpp)
diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
index 0ca3233968f3..95d66e864376 100644
--- a/apps/iir_blur/CMakeLists.txt
+++ b/apps/iir_blur/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(iir_blur FROM iir_blur.generator)
 add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
                    GENERATOR iir_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(iir_blur_filter filter.cpp)
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 46be29788591..183634bae137 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
index 2a64a719209f..c26fc13b9deb 100644
--- a/apps/stencil_chain/CMakeLists.txt
+++ b/apps/stencil_chain/CMakeLists.txt
@@ -18,7 +18,11 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block to avoid gpu kernel launch failure.
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1
+                   )
 
 # Main executable
 add_executable(stencil_chain_process process.cpp)
diff --git a/apps/unsharp/CMakeLists.txt b/apps/unsharp/CMakeLists.txt
index 7153dfbf6a4a..79434ec5f055 100644
--- a/apps/unsharp/CMakeLists.txt
+++ b/apps/unsharp/CMakeLists.txt
@@ -18,7 +18,8 @@ add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)
 add_halide_library(unsharp FROM unsharp.generator)
 add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                    GENERATOR unsharp
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(unsharp_filter filter.cpp)
diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
index 1910d90f04cf..718f474420e7 100644
--- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
+++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -123,6 +123,21 @@ string get_sanitized_name(string name) {
     return name;
 }
 
+// Similar to std::replace, but assuming the vector contains unique values. And
+// if the element is absent, append new value to the end of vector.
+void replace_or_emplace(std::vector<VarOrRVar> &dims_, const VarOrRVar &before, VarOrRVar after) {
+    auto iter = std::find_if(dims_.begin(), dims_.end(),
+                             [before_name = before.name()](const VarOrRVar &d) {
+                                 return d.name() == before_name;
+                             });
+    const bool is_found = (iter != dims_.end());
+    if (is_found) {
+        *iter = std::move(after);
+    } else {
+        dims_.emplace_back(std::move(after));
+    }
+}
+
 // Representation of a function stage in the pipeline.
 struct FStage {
     Function func;
@@ -1426,8 +1441,11 @@ class GPUTilingDedup {
             threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
         }
 
-        if (!is_already_split) {
-            helper.commit(sched, is_compute_at);
+        helper.commit(sched, is_compute_at);
+        if (is_compute_at) {
+            // There are dimensions that does not need splitting but marked as
+            // vectorizable. Mark them as gpu threads.
+            mark_gpu_threads(sched);
         }
 
         // After calling `gpu_tiles` from `GPUTileHelper::commit()`, a few of
@@ -3425,7 +3443,8 @@ void Partitioner::generate_group_cpu_schedule(
                     if (parallelized_split) {
                         auto split_vars = *parallelized_split;
                         inner_dims.emplace_back(split_vars.inner);
-                        outer_dims.emplace_back(split_vars.outer);
+
+                        replace_or_emplace(outer_dims, v, split_vars.outer);
                     }
                 } else {
                     f_handle.parallel(v);
diff --git a/test/autoschedulers/mullapudi2016/large_window.cpp b/test/autoschedulers/mullapudi2016/large_window.cpp
index fd2a1491c56b..c163d19f339c 100644
--- a/test/autoschedulers/mullapudi2016/large_window.cpp
+++ b/test/autoschedulers/mullapudi2016/large_window.cpp
@@ -47,7 +47,13 @@ int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
     Pipeline p(g);
 
-    p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+    constexpr Mullapudi2016TestParams gpu_specifications{
+        /* .last_level_cache_size = */ 35'000,
+        /* .parallelism = */ 128,
+    };
+
+    p.apply_autoscheduler(target,
+                          get_mullapudi2016_test_params(target.has_gpu_feature(), {gpu_specifications}));
 
     // Inspect the schedule (only for debugging))
     // g.print_loop_nest();
diff --git a/test/autoschedulers/mullapudi2016/reorder.cpp b/test/autoschedulers/mullapudi2016/reorder.cpp
index 18a3a332ebc6..39cce5264b65 100644
--- a/test/autoschedulers/mullapudi2016/reorder.cpp
+++ b/test/autoschedulers/mullapudi2016/reorder.cpp
@@ -82,21 +82,11 @@ double run_test_2(bool auto_schedule) {
         // Provide estimates on the pipeline output
         diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}});
 
-        // Auto-schedule the pipeline
-        //
-        // Increasing the GPU's active warp count estimate (aka parallelism)
-        // from 128 to 2048 to disable the Autoscheduler's grid-stride loop
-        // feature. At small parallelism value, the autoscheduler correctly
-        // designates dimension 'z' as the stride axis in the GPU grid-stride
-        // loop, which improves thread occupancy. However, it fails to reorder
-        // 'z' inside the gpu_blocks 'xo' and 'yo', which is required for proper
-        // loop nesting and successful code generation.
-        //
-        // Reference:
-        // https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+        // note(antonysigma): Reducing the GPU's shared memory size estimate so that the GPU kernel
+        // can launch on consumer-grade GPUs.
         constexpr Mullapudi2016TestParams gpu_specifications{
-            /* .last_level_cache_size = */ 47'000,
-            /* .parallelism = */ 2048,
+            /* .last_level_cache_size = */ 20'000,
+            /* .parallelism = */ 128,
         };
 
         p.apply_autoscheduler(
@@ -139,16 +129,9 @@ double run_test_3(bool auto_schedule) {
     if (auto_schedule) {
         // Provide estimates on the pipeline output
         r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}});
-        // Auto-schedule the pipeline
-        //
-        // Disabling this experimental GPU feature because the autoscheduler correctly
-        // identifies reduction domain 'r.x' as the stride axis for the GPU grid-stride loop,
-        // which helps retain threads efficiently. However, it fails to reorder 'r.x'
-        // inside the loop nests of gpu_blocks 'xo' and 'yo', which is necessary for
-        // successful code generation.
-        //
-        // Reference: https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
-        p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+
+        p.apply_autoscheduler(target,
+                              get_mullapudi2016_test_params(target.has_gpu_feature()));
     } else {
         Var par("par");
         r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4);

From a1aa557c16c29c606d7885eee7ba53da3cb5c7d5 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Thu, 10 Jul 2025 19:19:31 -0700
Subject: [PATCH 02/12] Parallelize by a factor of 1

CPU's threading model is distinct from GPU's thread group model: GPU
shared memory is not shared beyond one GPU thread group.

Whenever nested parallelism is enabled in the Mullapudi2016
auto-scheduler, always implement parallelizable loop dimensions as
`gpu_block`. This can be implemented by splitting the dimensions by a
factor 1: `f.split(z, zi, zo, 1)`.

This makes the autoscheduler's `last_level_cache` estimates per GPU warp
more robust against variations of the nested parallelism.

In the folder `*/apps/`, remove all manual override of
`last_level_cache_size`. Use the default estimate: 47kB per thread
group.
---
 apps/bgu/CMakeLists.txt                           | 2 +-
 apps/harris/CMakeLists.txt                        | 2 +-
 apps/iir_blur/CMakeLists.txt                      | 2 +-
 apps/lens_blur/CMakeLists.txt                     | 2 +-
 apps/stencil_chain/CMakeLists.txt                 | 2 +-
 src/autoschedulers/mullapudi2016/AutoSchedule.cpp | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index e8314fed4238..6aa6a5bb623a 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(bgu_filter filter.cpp)
diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
index 5657b4ff471d..b6f95e554383 100644
--- a/apps/harris/CMakeLists.txt
+++ b/apps/harris/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(harris_filter filter.cpp)
diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
index 95d66e864376..0ca3233968f3 100644
--- a/apps/iir_blur/CMakeLists.txt
+++ b/apps/iir_blur/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(iir_blur FROM iir_blur.generator)
 add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
                    GENERATOR iir_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(iir_blur_filter filter.cpp)
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 183634bae137..46be29788591 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
index c26fc13b9deb..8ee3e531996d 100644
--- a/apps/stencil_chain/CMakeLists.txt
+++ b/apps/stencil_chain/CMakeLists.txt
@@ -21,7 +21,7 @@ add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS autoscheduler.last_level_cache_size=2000 autoscheduler.experimental_gpu_schedule=1
                    )
 
 # Main executable
diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
index 718f474420e7..db1f7f9eb3b6 100644
--- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
+++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -1430,7 +1430,7 @@ class GPUTilingDedup {
             }
 
             split_info new_entry{entry};
-            new_entry.factor = simplify(min(threads_budget, new_entry.factor));
+            new_entry.factor = 1;
 
             const bool can_split = helper.try_split(new_entry);
             if (!can_split) {
@@ -1438,7 +1438,7 @@ class GPUTilingDedup {
                 parallelize.erase(iter);
                 continue;
             }
-            threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
+            threads_budget = simplify(max(threads_budget / entry.factor, 1));
         }
 
         helper.commit(sched, is_compute_at);

From 34696f917a9c4ebe87fdd19fb5011b9afac65e82 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Fri, 11 Jul 2025 10:26:38 -0700
Subject: [PATCH 03/12] Limit the threads in max_parallelism

---
 apps/bgu/CMakeLists.txt                       | 13 +++++++-
 apps/lens_blur/CMakeLists.txt                 | 31 +++++--------------
 apps/local_laplacian/CMakeLists.txt           |  2 +-
 apps/stencil_chain/CMakeLists.txt             | 15 +++++++--
 .../mullapudi2016/AutoSchedule.cpp            | 25 ++++++---------
 5 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index 6aa6a5bb623a..a7a366af10d4 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 
+set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
+    # When target=host-cuda or host-metal, set last_level_cache per GPU block
+    # eliminates all `.compute_at` in the generated schedules, which eliminates
+    # all GPU shared memory allocations.
+    list(APPEND _bgu_autoscheduler_params
+        autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_bgu_autoscheduler})
 
 # Main executable
 add_executable(bgu_filter filter.cpp)
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 46be29788591..2c9dae845f71 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
@@ -32,26 +32,11 @@ target_link_libraries(lens_blur_filter
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb_small.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "metal")
-        # Note(antonysigma): Buildbot error message:
-        #
-        # 2025-06-30 23:26:02.260 lens_blur_filter[32272:21031150] Metal API Validation
-        # Enabled -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
-        # failed assertion `(threadsPerThreadgroup.width(32) *
-        # threadsPerThreadgroup.height(32) * threadsPerThreadgroup.depth(1))(1024) must
-        # be <= 896. (kernel threadgroup size limit)'
-        #
-        # Possible root cause: Autoscheduler's GPUTilingDedup::max_n_threads is
-        # hardcoded to 1024 threads per block. The OSX Metal API caps the value at 836
-        # threads per block because of the register pressure in lens_blur's GPU kernel.
-        message ("Pipeline lens_blur_auto_schedule skipped for target host-metal")
-    else ()
-        configure_file(${IMAGE} rgb_small.png COPYONLY)
-        add_test(NAME lens_blur_filter
-                COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
-        set_tests_properties(lens_blur_filter PROPERTIES
-                            LABELS lens_blur
-                            PASS_REGULAR_EXPRESSION "Success!"
-                            SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb_small.png COPYONLY)
+    add_test(NAME lens_blur_filter
+            COMMAND lens_blur_filter rgb_small.png 32 13 0.5 32 3 out.png)
+    set_tests_properties(lens_blur_filter PROPERTIES
+                        LABELS lens_blur
+                        PASS_REGULAR_EXPRESSION "Success!"
+                        SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 068060ad83b8..f84d26da59cf 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -23,7 +23,7 @@ add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.experimental_gpu_schedule=1
                    )
 
 # Main executable
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
index 8ee3e531996d..38d10d5da4ff 100644
--- a/apps/stencil_chain/CMakeLists.txt
+++ b/apps/stencil_chain/CMakeLists.txt
@@ -14,14 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp)
 
+set(_stencil_chain_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
+    # When target=host-cuda or host-metal, set last_level_cache per GPU block
+    # eliminates all `.compute_at` in the generated schedules, which eliminates
+    # all GPU shared memory allocations.
+    list(APPEND _stencil_chain_autoscheduler_params
+        autoscheduler.last_level_cache_size=2000
+    )
+endif()
+
 # Filters
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # When target=host-cuda or host-metal, limit the GPU shared
-                   # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=2000 autoscheduler.experimental_gpu_schedule=1
+                   PARAMS ${_stenctil_chain_autoscheduler_params}
                    )
 
 # Main executable
diff --git a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
index db1f7f9eb3b6..2fd6a5b40ca3 100644
--- a/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
+++ b/src/autoschedulers/mullapudi2016/AutoSchedule.cpp
@@ -1368,7 +1368,7 @@ class GPUTilingDedup {
     }
 
     /** Generate Halide GPU schedules. */
-    void apply(AutoSchedule &sched) {
+    void apply(AutoSchedule &sched, const Expr &parallelism) {
         if (!ordering.empty() && !is_initial_order) {
             std::set<std::string> var_list;
             for (const auto &v : ordering) {
@@ -1396,7 +1396,7 @@ class GPUTilingDedup {
         }
 
         GPUTileHelper helper{f, stage_num};
-        Expr threads_budget = max_n_threads;
+        Expr threads_budget = min(parallelism, max_n_threads);
 
         // Maximize GPU thread occupancy with the grid-stride loop.
         //
@@ -1423,14 +1423,8 @@ class GPUTilingDedup {
 
             const auto &[var, entry] = *iter;
 
-            const bool should_unroll = can_prove(entry.factor <= 1);
-            if (should_unroll) {
-                // Skip thread size of 1.
-                continue;
-            }
-
             split_info new_entry{entry};
-            new_entry.factor = 1;
+            new_entry.factor = simplify(min(threads_budget, entry.factor));
 
             const bool can_split = helper.try_split(new_entry);
             if (!can_split) {
@@ -1438,7 +1432,7 @@ class GPUTilingDedup {
                 parallelize.erase(iter);
                 continue;
             }
-            threads_budget = simplify(max(threads_budget / entry.factor, 1));
+            threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
         }
 
         helper.commit(sched, is_compute_at);
@@ -2210,7 +2204,7 @@ Partitioner::find_best_tile_config(const Group &g) {
     Group no_tile = g;
     no_tile.tile_sizes = no_tile_config;
 
-    bool show_analysis = false;
+    constexpr bool show_analysis = false;
     GroupAnalysis no_tile_analysis = analyze_group(no_tile, show_analysis);
 
     GroupAnalysis best_analysis = no_tile_analysis;
@@ -2233,7 +2227,7 @@ Partitioner::find_best_tile_config(const Group &g) {
         Expr benefit = estimate_benefit(best_analysis, new_analysis,
                                         no_redundant_work, true);
 
-        if (show_analysis) {
+        if constexpr (show_analysis) {
             debug(0) << "Benefit relative to not tiling:" << benefit << "\n";
             debug(0) << "Best analysis:" << new_analysis;
             debug(0) << "No tile analysis:" << no_tile_analysis;
@@ -3439,7 +3433,8 @@ void Partitioner::generate_group_cpu_schedule(
                     }
                 }
                 if (arch_params.is_gpu_schedule) {
-                    auto parallelized_split = gpu_tiling.can_parallelize(v, iter->second);
+                    const Expr gpu_threads = simplify(min(iter->second, arch_params.parallelism / def_par));
+                    auto parallelized_split = gpu_tiling.can_parallelize(v, gpu_threads);
                     if (parallelized_split) {
                         auto split_vars = *parallelized_split;
                         inner_dims.emplace_back(split_vars.inner);
@@ -3463,7 +3458,7 @@ void Partitioner::generate_group_cpu_schedule(
     }
 
     if (arch_params.is_gpu_schedule) {
-        gpu_tiling.apply(sched);
+        gpu_tiling.apply(sched, arch_params.parallelism);
     }
 
     // Find the level at which group members will be computed.
@@ -3552,7 +3547,7 @@ void Partitioner::generate_group_cpu_schedule(
                         mem_rvars, mem_estimates, sched, gpu_tiling2);
 
         if (arch_params.is_gpu_schedule) {
-            gpu_tiling2.apply(sched);
+            gpu_tiling2.apply(sched, arch_params.parallelism);
         }
     }
 }

From 30b29448d1007f38b287fdaaae57caab97b58dc1 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Tue, 15 Jul 2025 19:53:49 -0700
Subject: [PATCH 04/12] Suppress GPU shared memory size estimates for
 challengling algorithm pipelines

---
 apps/bgu/CMakeLists.txt             |  6 +++---
 apps/camera_pipe/CMakeLists.txt     | 19 ++++++++++++++++++-
 apps/harris/harris_generator.cpp    |  2 +-
 apps/lens_blur/CMakeLists.txt       | 19 ++++++++++++++++++-
 apps/local_laplacian/CMakeLists.txt | 20 ++++++++++++++++++--
 5 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index a7a366af10d4..d39bd7253359 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -17,9 +17,9 @@ add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
 if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
-    # When target=host-cuda or host-metal, set last_level_cache per GPU block
-    # eliminates all `.compute_at` in the generated schedules, which eliminates
-    # all GPU shared memory allocations.
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
     list(APPEND _bgu_autoscheduler_params
         autoscheduler.last_level_cache_size=2000
     )
diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
index cd5a207e3086..0cf53304d1eb 100644
--- a/apps/camera_pipe/CMakeLists.txt
+++ b/apps/camera_pipe/CMakeLists.txt
@@ -16,12 +16,29 @@ add_halide_generator(camera_pipe.generator
                      SOURCES camera_pipe_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
+set(_camera_pipe_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _camera_pipe_autoscheduler_params
+        autoscheduler.last_level_cache_size=10000
+    )
+elseif(NOT Halide_TARGET MATCHES "opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _camera_pipe_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_camera_pipe_autoscheduler_params})
 
 # Main executable
 add_executable(camera_pipe_process process.cpp)
diff --git a/apps/harris/harris_generator.cpp b/apps/harris/harris_generator.cpp
index 69cf8c05c68c..bd35b4ae25d4 100644
--- a/apps/harris/harris_generator.cpp
+++ b/apps/harris/harris_generator.cpp
@@ -66,7 +66,7 @@ class Harris : public Halide::Generator<Harris> {
             const int kHeight = 2560;
             input.dim(0).set_estimate(0, kWidth);
             input.dim(1).set_estimate(0, kHeight);
-            input.dim(2).set_estimate(0, 3);
+            input.dim(2).set_estimate(0, 4);
             output.dim(0).set_estimate(3, kWidth - 6);
             output.dim(1).set_estimate(3, kHeight - 6);
         }
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 2c9dae845f71..0bbc96cd03dd 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -14,12 +14,29 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(lens_blur.generator SOURCES lens_blur_generator.cpp)
 
+set(_lens_blur_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda|metal")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _lens_blur_autoscheduler_params
+        autoscheduler.last_level_cache_size=10000
+    )
+elseif(NOT Halide_TARGET MATCHES "opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _lens_blur_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_lens_blur_autoscheduler_params})
 
 # Main executable
 add_executable(lens_blur_filter process.cpp)
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index f84d26da59cf..3bd4b32d961f 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -16,6 +16,23 @@ add_halide_generator(local_laplacian.generator
                      SOURCES local_laplacian_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
+set(_local_laplacian_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(NOT Halide_TARGET MATCHES "cuda")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _local_laplacian_autoscheduler_params
+        autoscheduler.last_level_cache_size=10000
+    )
+elseif(NOT Halide_TARGET MATCHES "metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _local_laplacian_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
@@ -23,8 +40,7 @@ add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    AUTOSCHEDULER Halide::Mullapudi2016
                    # When target=host-cuda or host-metal, limit the GPU shared
                    # memory per block to avoid gpu kernel launch failure.
-                   PARAMS autoscheduler.last_level_cache_size=30000 autoscheduler.experimental_gpu_schedule=1
-                   )
+                   PARAMS ${_local_laplacian_autoscheduler_params})
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)

From b3d2f7f7760a65d7f32b883f5a2c7b3badafb636 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Wed, 16 Jul 2025 14:55:24 -0700
Subject: [PATCH 05/12] Eliminate double negation of the conditionals

---
 apps/bgu/CMakeLists.txt             | 2 +-
 apps/camera_pipe/CMakeLists.txt     | 4 ++--
 apps/lens_blur/CMakeLists.txt       | 4 ++--
 apps/local_laplacian/CMakeLists.txt | 4 ++--
 apps/stencil_chain/CMakeLists.txt   | 8 ++++----
 5 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/apps/bgu/CMakeLists.txt b/apps/bgu/CMakeLists.txt
index d39bd7253359..a08c6b825454 100644
--- a/apps/bgu/CMakeLists.txt
+++ b/apps/bgu/CMakeLists.txt
@@ -16,7 +16,7 @@ add_halide_generator(bgu.generator SOURCES bgu_generator.cpp)
 
 set(_bgu_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
index 0cf53304d1eb..461d1857651d 100644
--- a/apps/camera_pipe/CMakeLists.txt
+++ b/apps/camera_pipe/CMakeLists.txt
@@ -18,13 +18,13 @@ add_halide_generator(camera_pipe.generator
 
 set(_camera_pipe_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(NOT Halide_TARGET MATCHES "cuda|metal")
+if(Halide_TARGET MATCHES "cuda|metal")
     # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
     # tuned to pass the Builbot tests.
     list(APPEND _camera_pipe_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(NOT Halide_TARGET MATCHES "opencl")
+elseif(Halide_TARGET MATCHES "opencl")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/lens_blur/CMakeLists.txt b/apps/lens_blur/CMakeLists.txt
index 0bbc96cd03dd..aa99e55be6fd 100644
--- a/apps/lens_blur/CMakeLists.txt
+++ b/apps/lens_blur/CMakeLists.txt
@@ -16,13 +16,13 @@ add_halide_generator(lens_blur.generator SOURCES lens_blur_generator.cpp)
 
 set(_lens_blur_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(NOT Halide_TARGET MATCHES "cuda|metal")
+if(Halide_TARGET MATCHES "cuda|metal")
     # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
     # tuned to pass the Builbot tests.
     list(APPEND _lens_blur_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(NOT Halide_TARGET MATCHES "opencl")
+elseif(Halide_TARGET MATCHES "opencl")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 3bd4b32d961f..97884be54fee 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -18,13 +18,13 @@ add_halide_generator(local_laplacian.generator
 
 set(_local_laplacian_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(NOT Halide_TARGET MATCHES "cuda")
+if(Halide_TARGET MATCHES "cuda")
     # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
     # tuned to pass the Builbot tests.
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(NOT Halide_TARGET MATCHES "metal|opencl")
+elseif(Halide_TARGET MATCHES "metal|opencl")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/stencil_chain/CMakeLists.txt b/apps/stencil_chain/CMakeLists.txt
index 38d10d5da4ff..dabef9b07159 100644
--- a/apps/stencil_chain/CMakeLists.txt
+++ b/apps/stencil_chain/CMakeLists.txt
@@ -16,10 +16,10 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp
 
 set(_stencil_chain_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(NOT Halide_TARGET MATCHES "cuda|metal|opencl")
-    # When target=host-cuda or host-metal, set last_level_cache per GPU block
-    # eliminates all `.compute_at` in the generated schedules, which eliminates
-    # all GPU shared memory allocations.
+if(Halide_TARGET MATCHES "cuda|metal|opencl")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
     list(APPEND _stencil_chain_autoscheduler_params
         autoscheduler.last_level_cache_size=2000
     )

From 56bff19aaff06c0d0c592f353b843107bd9b45aa Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Thu, 17 Jul 2025 10:54:13 -0700
Subject: [PATCH 06/12] Also estimate the last_level_cache size for Vulkan
 devices

---
 apps/camera_pipe/CMakeLists.txt     |  2 +-
 apps/harris/CMakeLists.txt          | 13 ++++++++++++-
 apps/local_laplacian/CMakeLists.txt |  8 ++++----
 3 files changed, 17 insertions(+), 6 deletions(-)

diff --git a/apps/camera_pipe/CMakeLists.txt b/apps/camera_pipe/CMakeLists.txt
index 461d1857651d..ef36ff938d4e 100644
--- a/apps/camera_pipe/CMakeLists.txt
+++ b/apps/camera_pipe/CMakeLists.txt
@@ -24,7 +24,7 @@ if(Halide_TARGET MATCHES "cuda|metal")
     list(APPEND _camera_pipe_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(Halide_TARGET MATCHES "opencl")
+elseif(Halide_TARGET MATCHES "opencl|vulkan")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
index b6f95e554383..2704fb448807 100644
--- a/apps/harris/CMakeLists.txt
+++ b/apps/harris/CMakeLists.txt
@@ -14,12 +14,23 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(harris.generator SOURCES harris_generator.cpp)
 
+set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "opencl|metal")
+    # Set last_level_cache per GPU block to an extremely small value. This
+    # eliminates all `.compute_at` in the generated schedules, which in turn
+    # eliminates all GPU shared memory allocations.
+    list(APPEND _harris_autoscheduler_params
+        autoscheduler.last_level_cache_size=1000
+    )
+endif()
+
 # Filters
 add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_harris_autoscheduler_params})
 
 # Main executable
 add_executable(harris_filter filter.cpp)
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 97884be54fee..2fa4f8dc4b51 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -24,12 +24,12 @@ if(Halide_TARGET MATCHES "cuda")
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(Halide_TARGET MATCHES "metal|opencl")
-    # Set last_level_cache per GPU block to an extremely small value. This
-    # eliminates all `.compute_at` in the generated schedules, which in turn
-    # eliminates all GPU shared memory allocations.
+elseif(Halide_TARGET MATCHES "metal|opencl|vulkan")
+    # The pipeline is shared GPU memory bounded. Limit the parallelism to
+    # minimal value (=32) to cap the shared GPU memory size.
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=1000
+        autoscheduler.parallelism=32
     )
 endif()
 

From 997f082237733ff1c9fcc7284e4aed4c39247cd2 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Thu, 17 Jul 2025 15:32:30 -0700
Subject: [PATCH 07/12] Hardcode input/output dimensions of local laplacian

---
 apps/harris/CMakeLists.txt                         |  2 +-
 apps/local_laplacian/local_laplacian_generator.cpp | 13 +++++++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/apps/harris/CMakeLists.txt b/apps/harris/CMakeLists.txt
index 2704fb448807..dff4b192816f 100644
--- a/apps/harris/CMakeLists.txt
+++ b/apps/harris/CMakeLists.txt
@@ -16,7 +16,7 @@ add_halide_generator(harris.generator SOURCES harris_generator.cpp)
 
 set(_harris_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(Halide_TARGET MATCHES "opencl|metal")
+if(Halide_TARGET MATCHES "opencl|metal|cuda|vulkan")
     # Set last_level_cache per GPU block to an extremely small value. This
     # eliminates all `.compute_at` in the generated schedules, which in turn
     # eliminates all GPU shared memory allocations.
diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp
index 860540e74517..6b540ea09a34 100644
--- a/apps/local_laplacian/local_laplacian_generator.cpp
+++ b/apps/local_laplacian/local_laplacian_generator.cpp
@@ -97,6 +97,19 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
         // Provide estimates on the pipeline output
         output.set_estimates({{0, 1536}, {0, 2560}, {0, 3}});
 
+        // Hardcode the input and output dimensions to suppress the OpenCL/Metal
+        // launch failure:
+        //
+        //  OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
+        //  failed
+        input.dim(0).set_bounds(0, 1536).set_stride(1);
+        input.dim(1).set_bounds(0, 2560).set_stride(1536);
+        input.dim(2).set_bounds(0, 3).set_stride(1536 * 2560);
+
+        output.dim(0).set_bounds(0, 1536).set_stride(1);
+        output.dim(1).set_bounds(0, 2560).set_stride(1536);
+        output.dim(2).set_bounds(0, 3).set_stride(1536 * 2560);
+
         /* THE SCHEDULE */
         if (using_autoscheduler()) {
             // Nothing.

From 04f563b0eb73804aefa868d4b2f7217a7d69aa1c Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Fri, 18 Jul 2025 10:32:47 -0700
Subject: [PATCH 08/12] Explicitly skipping local_laplacian tests for GPU
 targets

---
 apps/local_laplacian/CMakeLists.txt | 15 ++++-----
 apps/local_laplacian/process.cpp    | 48 +++++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 9 deletions(-)

diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 2fa4f8dc4b51..13147cecd1a1 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -18,19 +18,12 @@ add_halide_generator(local_laplacian.generator
 
 set(_local_laplacian_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
 
-if(Halide_TARGET MATCHES "cuda")
+if(Halide_TARGET MATCHES "cuda|metal|opencl|vulkan")
     # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
     # tuned to pass the Builbot tests.
     list(APPEND _local_laplacian_autoscheduler_params
         autoscheduler.last_level_cache_size=10000
     )
-elseif(Halide_TARGET MATCHES "metal|opencl|vulkan")
-    # The pipeline is shared GPU memory bounded. Limit the parallelism to
-    # minimal value (=32) to cap the shared GPU memory size.
-    list(APPEND _local_laplacian_autoscheduler_params
-        autoscheduler.last_level_cache_size=1000
-        autoscheduler.parallelism=32
-    )
 endif()
 
 # Filters
@@ -58,5 +51,9 @@ if (EXISTS ${IMAGE})
     set_tests_properties(local_laplacian_process PROPERTIES
                          LABELS local_laplacian
                          PASS_REGULAR_EXPRESSION "Success!"
-                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
+                         # Pass in the keyword "metal" etc to skip the test
+                         # explicitly. Buildbot can print a nice test report
+                         # for all skipped tests.
+                         ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
 endif ()
diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
index 389ddbfb1b1d..6ae411a7604b 100644
--- a/apps/local_laplacian/process.cpp
+++ b/apps/local_laplacian/process.cpp
@@ -1,5 +1,7 @@
 #include <chrono>
 #include <cstdio>
+#include <cstdlib>
+#include <regex>
 
 #include "local_laplacian.h"
 #ifndef NO_AUTO_SCHEDULE
@@ -13,6 +15,48 @@
 using namespace Halide::Runtime;
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    IS_CUDA,
+    NOT_CUDA,
+    ENV_VARIABLE_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = std::getenv("HL_TARGET");
+    if (hl_target == nullptr) {
+        printf("Warning: Environment variable HL_TARGET not specified. "
+               "Proceeding to the tests...\n");
+        return ENV_VARIABLE_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"cuda|metal|vulkan|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
+        // failed
+        //
+        // 2025-07-17 17:24:32.170 local_laplacian_process[63513:6587844] Metal
+        // API Validation Enabled -[MTLDebugComputeCommandEncoder
+        // _validateThreadsPerThreadgroup:]:1266: failed assertion
+        // `(threadsPerThreadgroup.width(62) * threadsPerThreadgroup.height(32)
+        // * threadsPerThreadgroup.depth(1))(1984) must be <= 1024. (device
+        // threadgroup size limit)'
+        //
+        // Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
+        printf("[SKIP] Mullapudi2016 experimental GPU schedules "
+               "generates the gpu_threads where thread count per block "
+               "is not an multiple of 32. Target = %s. Skipping...\n",
+               hl_target);
+
+        return NOT_CUDA;
+    }
+
+    return IS_CUDA;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc < 7) {
         printf("Usage: ./process input.png levels alpha beta timing_iterations output.png\n"
@@ -20,6 +64,10 @@ int main(int argc, char **argv) {
         return 1;
     }
 
+    if (ensure_cuda_device() == NOT_CUDA) {
+        return 0;
+    }
+
     // Input may be a PNG8
     Buffer<uint16_t, 3> input = load_and_convert_image(argv[1]);
 

From 1468c7886dbf26706d04f8f0ae3bd702579e21c5 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Fri, 18 Jul 2025 13:34:43 -0700
Subject: [PATCH 09/12] Skip iir_blur tests on Metal and OpenCL devices

---
 apps/iir_blur/CMakeLists.txt     | 25 ++++++++-----------
 apps/iir_blur/filter.cpp         | 43 ++++++++++++++++++++++++++++++++
 apps/local_laplacian/process.cpp |  4 +--
 apps/unsharp/CMakeLists.txt      | 23 ++++++++++++++++-
 4 files changed, 78 insertions(+), 17 deletions(-)

diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
index 0ca3233968f3..586bdc83e8a5 100644
--- a/apps/iir_blur/CMakeLists.txt
+++ b/apps/iir_blur/CMakeLists.txt
@@ -32,18 +32,15 @@ target_link_libraries(iir_blur_filter PRIVATE
 # Test that the app actually works!
 set(IMAGE ${CMAKE_CURRENT_LIST_DIR}/../images/rgb.png)
 if (EXISTS ${IMAGE})
-    if (Halide_TARGET MATCHES "opencl")
-        # Error message:
-        #
-        # Error: OpenCL error: CL_INVALID_COMMAND_QUEUE clFinish failed
-        message(WARNING "Skipping Mullapudi2016's GPU auto-schedules for OpenCL target.")
-    else ()
-        configure_file(${IMAGE} rgb.png COPYONLY)
-        add_test(NAME iir_blur_filter
-                COMMAND iir_blur_filter rgb.png out.png)
-        set_tests_properties(iir_blur_filter PROPERTIES
-                            LABELS iir_blur
-                            PASS_REGULAR_EXPRESSION "Success!"
-                            SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
-    endif ()
+    configure_file(${IMAGE} rgb.png COPYONLY)
+    add_test(NAME iir_blur_filter
+            COMMAND iir_blur_filter rgb.png out.png)
+    set_tests_properties(iir_blur_filter PROPERTIES
+                        LABELS iir_blur
+                        PASS_REGULAR_EXPRESSION "Success!"
+                        SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
+                        # Pass in the keyword "metal" etc to skip the test
+                        # explicitly. Buildbot can print a nice test report
+                        # for all skipped tests.
+                        ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
 endif ()
diff --git a/apps/iir_blur/filter.cpp b/apps/iir_blur/filter.cpp
index fe0abd45ff79..2eb9f3aa8b93 100644
--- a/apps/iir_blur/filter.cpp
+++ b/apps/iir_blur/filter.cpp
@@ -1,6 +1,7 @@
 #include <cassert>
 #include <cstdio>
 #include <cstdlib>
+#include <regex>
 
 #include "HalideBuffer.h"
 #include "HalideRuntime.h"
@@ -13,12 +14,54 @@
 
 using namespace Halide::Tools;
 
+namespace {
+
+enum DeviceState {
+    USING_METAL_OR_OPENCL,
+    NOT_METAL_OR_OPENCL,
+    ENV_VARIABLE_ABSENT,
+};
+DeviceState ensure_cuda_device() {
+    const auto hl_target = std::getenv("HL_TARGET");
+    if (hl_target == nullptr) {
+        printf("Warning: Environment variable HL_TARGET not specified. "
+               "Proceeding to the tests...\n");
+        return ENV_VARIABLE_ABSENT;
+    }
+
+    if (std::regex_search(hl_target, std::regex{"metal|opencl"})) {
+        // note(antonysigma): Error messages if we don't skip the test:
+        //
+        // OpenCL error: clFinish timeout.
+        //
+        // Metal: copy_to_host() failed. Error
+        // Domain=MTLCommandBufferErrorDomain Code=2 "Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)"
+        // UserInfo={NSLocalizedDescription=Caused GPU Timeout Error
+        // (00000002:kIOAccelCommandBufferCallbackErrorTimeout)}
+        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
+               "generates copy_to_host() function calls that timeout. "
+               "Target = %s. Skipping...\n",
+               hl_target);
+
+        return USING_METAL_OR_OPENCL;
+    }
+
+    return NOT_METAL_OR_OPENCL;
+}
+
+}  // namespace
+
 int main(int argc, char **argv) {
     if (argc != 3) {
         printf("Usage: %s in out\n", argv[0]);
         return 1;
     }
 
+    if (ensure_cuda_device() == USING_METAL_OR_OPENCL) {
+        return 0;
+    }
+
     Halide::Runtime::Buffer<float, 3> input = load_and_convert_image(argv[1]);
     Halide::Runtime::Buffer<float, 3> output(input.width(), input.height(), input.channels());
 
diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
index 6ae411a7604b..46c5712cab19 100644
--- a/apps/local_laplacian/process.cpp
+++ b/apps/local_laplacian/process.cpp
@@ -30,7 +30,7 @@ DeviceState ensure_cuda_device() {
         return ENV_VARIABLE_ABSENT;
     }
 
-    if (std::regex_search(hl_target, std::regex{"cuda|metal|vulkan|opencl"})) {
+    if (std::regex_search(hl_target, std::regex{"metal|vulkan|opencl"})) {
         // note(antonysigma): Error messages if we don't skip the test:
         //
         // OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
@@ -44,7 +44,7 @@ DeviceState ensure_cuda_device() {
         // threadgroup size limit)'
         //
         // Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
-        printf("[SKIP] Mullapudi2016 experimental GPU schedules "
+        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
                "generates the gpu_threads where thread count per block "
                "is not an multiple of 32. Target = %s. Skipping...\n",
                hl_target);
diff --git a/apps/unsharp/CMakeLists.txt b/apps/unsharp/CMakeLists.txt
index 79434ec5f055..747fac9b403c 100644
--- a/apps/unsharp/CMakeLists.txt
+++ b/apps/unsharp/CMakeLists.txt
@@ -14,12 +14,33 @@ find_package(Halide REQUIRED)
 # Generator
 add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)
 
+set(_unsharp_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
+
+if(Halide_TARGET MATCHES "cuda|opencl|vulkan")
+    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
+    # tuned to pass the Builbot tests.
+    list(APPEND _unsharp_autoscheduler_params
+        autoscheduler.last_level_cache_size=20000
+    )
+elseif(Halide_TARGET MATCHES "metal")
+    # Resolving Metal error regarding the threads per GPU block limit:
+    #
+    # -[MTLDebugComputeCommandEncoder _validateThreadsPerThreadgroup:]:1267:
+    # failed assertion `(threadsPerThreadgroup.width(70) *
+    # threadsPerThreadgroup.height(8) * threadsPerThreadgroup.depth(1))(560)
+    # must be <= 448. (kernel threadgroup size limit)`
+    list(APPEND _unsharp_autoscheduler_params
+        autoscheduler.last_level_cache_size=20000
+        autoscheduler.parallelism=32
+    )
+endif()
+
 # Filters
 add_halide_library(unsharp FROM unsharp.generator)
 add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                    GENERATOR unsharp
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS ${_unsharp_autoscheduler_params})
 
 # Main executable
 add_executable(unsharp_filter filter.cpp)

From d6d18228232b38bb276f6ccf02ab28f3e250687f Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Mon, 21 Jul 2025 10:34:38 -0700
Subject: [PATCH 10/12] Retrieve target string from metadata

---
 apps/iir_blur/CMakeLists.txt        | 6 +-----
 apps/iir_blur/filter.cpp            | 9 ++++-----
 apps/local_laplacian/CMakeLists.txt | 6 +-----
 apps/local_laplacian/process.cpp    | 9 ++++-----
 4 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/apps/iir_blur/CMakeLists.txt b/apps/iir_blur/CMakeLists.txt
index 586bdc83e8a5..474af15dcf35 100644
--- a/apps/iir_blur/CMakeLists.txt
+++ b/apps/iir_blur/CMakeLists.txt
@@ -38,9 +38,5 @@ if (EXISTS ${IMAGE})
     set_tests_properties(iir_blur_filter PROPERTIES
                         LABELS iir_blur
                         PASS_REGULAR_EXPRESSION "Success!"
-                        SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
-                        # Pass in the keyword "metal" etc to skip the test
-                        # explicitly. Buildbot can print a nice test report
-                        # for all skipped tests.
-                        ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
+                        SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/iir_blur/filter.cpp b/apps/iir_blur/filter.cpp
index 2eb9f3aa8b93..e94b57c621d9 100644
--- a/apps/iir_blur/filter.cpp
+++ b/apps/iir_blur/filter.cpp
@@ -1,6 +1,5 @@
 #include <cassert>
 #include <cstdio>
-#include <cstdlib>
 #include <regex>
 
 #include "HalideBuffer.h"
@@ -19,14 +18,14 @@ namespace {
 enum DeviceState {
     USING_METAL_OR_OPENCL,
     NOT_METAL_OR_OPENCL,
-    ENV_VARIABLE_ABSENT,
+    METADATA_ABSENT,
 };
 DeviceState ensure_cuda_device() {
-    const auto hl_target = std::getenv("HL_TARGET");
+    const auto hl_target = iir_blur_auto_schedule_metadata()->target;
     if (hl_target == nullptr) {
-        printf("Warning: Environment variable HL_TARGET not specified. "
+        printf("Warning: variable *_metadata()->target not specified. "
                "Proceeding to the tests...\n");
-        return ENV_VARIABLE_ABSENT;
+        return METADATA_ABSENT;
     }
 
     if (std::regex_search(hl_target, std::regex{"metal|opencl"})) {
diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 13147cecd1a1..00cd7d498b63 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -51,9 +51,5 @@ if (EXISTS ${IMAGE})
     set_tests_properties(local_laplacian_process PROPERTIES
                          LABELS local_laplacian
                          PASS_REGULAR_EXPRESSION "Success!"
-                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]"
-                         # Pass in the keyword "metal" etc to skip the test
-                         # explicitly. Buildbot can print a nice test report
-                         # for all skipped tests.
-                         ENVIRONMENT "HL_TARGET=${Halide_TARGET}")
+                         SKIP_REGULAR_EXPRESSION "\\[SKIP\\]")
 endif ()
diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
index 46c5712cab19..fb92cf6cd006 100644
--- a/apps/local_laplacian/process.cpp
+++ b/apps/local_laplacian/process.cpp
@@ -1,6 +1,5 @@
 #include <chrono>
 #include <cstdio>
-#include <cstdlib>
 #include <regex>
 
 #include "local_laplacian.h"
@@ -20,14 +19,14 @@ namespace {
 enum DeviceState {
     IS_CUDA,
     NOT_CUDA,
-    ENV_VARIABLE_ABSENT,
+    METADATA_ABSENT,
 };
 DeviceState ensure_cuda_device() {
-    const auto hl_target = std::getenv("HL_TARGET");
+    const auto hl_target = local_laplacian_auto_schedule_metadata()->target;
     if (hl_target == nullptr) {
-        printf("Warning: Environment variable HL_TARGET not specified. "
+        printf("Warning: variable *_metadata()->target not specified. "
                "Proceeding to the tests...\n");
-        return ENV_VARIABLE_ABSENT;
+        return METADATA_ABSENT;
     }
 
     if (std::regex_search(hl_target, std::regex{"metal|vulkan|opencl"})) {

From dc105a0e2b881f72152dce1f6f9a04259bb3eca3 Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Mon, 21 Jul 2025 10:49:04 -0700
Subject: [PATCH 11/12] Eliminate compute_at in unsharp_filter for metal device

---
 apps/unsharp/CMakeLists.txt | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/apps/unsharp/CMakeLists.txt b/apps/unsharp/CMakeLists.txt
index 747fac9b403c..66fdb1afa285 100644
--- a/apps/unsharp/CMakeLists.txt
+++ b/apps/unsharp/CMakeLists.txt
@@ -30,9 +30,7 @@ elseif(Halide_TARGET MATCHES "metal")
     # threadsPerThreadgroup.height(8) * threadsPerThreadgroup.depth(1))(560)
     # must be <= 448. (kernel threadgroup size limit)`
     list(APPEND _unsharp_autoscheduler_params
-        autoscheduler.last_level_cache_size=20000
-        autoscheduler.parallelism=32
-    )
+        autoscheduler.last_level_cache_size=1000)
 endif()
 
 # Filters

From b46e0ff005f3b78cf03d699c3e15ac250c99d5bb Mon Sep 17 00:00:00 2001
From: Antony Chan <antony.sigma@gmail.com>
Date: Tue, 22 Jul 2025 13:59:38 -0700
Subject: [PATCH 12/12] Always disable experimental GPU autoscheduler for Local
 Laplacian

---
 apps/local_laplacian/CMakeLists.txt           | 30 +++++++-----
 .../local_laplacian_generator.cpp             | 13 -----
 apps/local_laplacian/process.cpp              | 47 -------------------
 3 files changed, 17 insertions(+), 73 deletions(-)

diff --git a/apps/local_laplacian/CMakeLists.txt b/apps/local_laplacian/CMakeLists.txt
index 00cd7d498b63..a81978731e14 100644
--- a/apps/local_laplacian/CMakeLists.txt
+++ b/apps/local_laplacian/CMakeLists.txt
@@ -16,24 +16,28 @@ add_halide_generator(local_laplacian.generator
                      SOURCES local_laplacian_generator.cpp
                      LINK_LIBRARIES Halide::Tools)
 
-set(_local_laplacian_autoscheduler_params autoscheduler.experimental_gpu_schedule=1)
-
-if(Halide_TARGET MATCHES "cuda|metal|opencl|vulkan")
-    # Last level cache size estimate of the Nvidia GPU on the Buildbot. Hand
-    # tuned to pass the Builbot tests.
-    list(APPEND _local_laplacian_autoscheduler_params
-        autoscheduler.last_level_cache_size=10000
-    )
-endif()
-
 # Filters
 add_halide_library(local_laplacian FROM local_laplacian.generator)
 add_halide_library(local_laplacian_auto_schedule FROM local_laplacian.generator
                    GENERATOR local_laplacian
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # When target=host-cuda or host-metal, limit the GPU shared
-                   # memory per block to avoid gpu kernel launch failure.
-                   PARAMS ${_local_laplacian_autoscheduler_params})
+                   # note(antonysigma): Works on CUDA and CPU targets, but not
+                   # others. Error messages if we don't skip the test:
+                   #
+                   # OpenCL error: CL_INVALID_WORK_GROUP_SIZE
+                   # clEnqueueNDRangeKernel failed
+                   #
+                   # 2025-07-17 17:24:32.170
+                   # local_laplacian_process[63513:6587844] Metal API Validation
+                   # Enabled -[MTLDebugComputeCommandEncoder
+                   # _validateThreadsPerThreadgroup:]:1266: failed assertion
+                   # `(threadsPerThreadgroup.width(62) *
+                   # threadsPerThreadgroup.height(32)
+                   # * threadsPerThreadgroup.depth(1))(1984) must be <= 1024.
+                   # (device threadgroup size limit)'
+                   #
+                   # Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
+                   PARAMS autoscheduler.experimental_gpu_schedule=0)
 
 # Main executable
 add_executable(local_laplacian_process process.cpp)
diff --git a/apps/local_laplacian/local_laplacian_generator.cpp b/apps/local_laplacian/local_laplacian_generator.cpp
index 6b540ea09a34..860540e74517 100644
--- a/apps/local_laplacian/local_laplacian_generator.cpp
+++ b/apps/local_laplacian/local_laplacian_generator.cpp
@@ -97,19 +97,6 @@ class LocalLaplacian : public Halide::Generator<LocalLaplacian> {
         // Provide estimates on the pipeline output
         output.set_estimates({{0, 1536}, {0, 2560}, {0, 3}});
 
-        // Hardcode the input and output dimensions to suppress the OpenCL/Metal
-        // launch failure:
-        //
-        //  OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
-        //  failed
-        input.dim(0).set_bounds(0, 1536).set_stride(1);
-        input.dim(1).set_bounds(0, 2560).set_stride(1536);
-        input.dim(2).set_bounds(0, 3).set_stride(1536 * 2560);
-
-        output.dim(0).set_bounds(0, 1536).set_stride(1);
-        output.dim(1).set_bounds(0, 2560).set_stride(1536);
-        output.dim(2).set_bounds(0, 3).set_stride(1536 * 2560);
-
         /* THE SCHEDULE */
         if (using_autoscheduler()) {
             // Nothing.
diff --git a/apps/local_laplacian/process.cpp b/apps/local_laplacian/process.cpp
index fb92cf6cd006..389ddbfb1b1d 100644
--- a/apps/local_laplacian/process.cpp
+++ b/apps/local_laplacian/process.cpp
@@ -1,6 +1,5 @@
 #include <chrono>
 #include <cstdio>
-#include <regex>
 
 #include "local_laplacian.h"
 #ifndef NO_AUTO_SCHEDULE
@@ -14,48 +13,6 @@
 using namespace Halide::Runtime;
 using namespace Halide::Tools;
 
-namespace {
-
-enum DeviceState {
-    IS_CUDA,
-    NOT_CUDA,
-    METADATA_ABSENT,
-};
-DeviceState ensure_cuda_device() {
-    const auto hl_target = local_laplacian_auto_schedule_metadata()->target;
-    if (hl_target == nullptr) {
-        printf("Warning: variable *_metadata()->target not specified. "
-               "Proceeding to the tests...\n");
-        return METADATA_ABSENT;
-    }
-
-    if (std::regex_search(hl_target, std::regex{"metal|vulkan|opencl"})) {
-        // note(antonysigma): Error messages if we don't skip the test:
-        //
-        // OpenCL error: CL_INVALID_WORK_GROUP_SIZE clEnqueueNDRangeKernel
-        // failed
-        //
-        // 2025-07-17 17:24:32.170 local_laplacian_process[63513:6587844] Metal
-        // API Validation Enabled -[MTLDebugComputeCommandEncoder
-        // _validateThreadsPerThreadgroup:]:1266: failed assertion
-        // `(threadsPerThreadgroup.width(62) * threadsPerThreadgroup.height(32)
-        // * threadsPerThreadgroup.depth(1))(1984) must be <= 1024. (device
-        // threadgroup size limit)'
-        //
-        // Vulkan: vkQueueWaitIdle returned VK_ERROR_DEVICE_LOST
-        printf("[SKIP] Mullapudi2016 experimental GPU schedule "
-               "generates the gpu_threads where thread count per block "
-               "is not an multiple of 32. Target = %s. Skipping...\n",
-               hl_target);
-
-        return NOT_CUDA;
-    }
-
-    return IS_CUDA;
-}
-
-}  // namespace
-
 int main(int argc, char **argv) {
     if (argc < 7) {
         printf("Usage: ./process input.png levels alpha beta timing_iterations output.png\n"
@@ -63,10 +20,6 @@ int main(int argc, char **argv) {
         return 1;
     }
 
-    if (ensure_cuda_device() == NOT_CUDA) {
-        return 0;
-    }
-
     // Input may be a PNG8
     Buffer<uint16_t, 3> input = load_and_convert_image(argv[1]);