Commit e46ac2c

Replace or emplace outer dimensions for GPU schedules
In the nested-parallelism scheduling algorithm, whenever a dimension is split for GPU acceleration, e.g. `y -> y_i, y_o`, replace the corresponding variable `y` with `y_o` in `outer_dims`. This ensures the internal assertion `dims.size() >= outer_dims.size()` always holds for GPU schedules. The immediate effect is that for a downstream stage with a GPU schedule such as `g.gpu_tile(x, xi, xo, ...)`, an upstream stage now refers to the correct dimension `xo` in `f.compute_at(g, xo)`, in accordance with the original design intent of the Mullapudi2016 paper. As a result, the GPU IR correctly synthesizes shared GPU memory to cache the intermediate results of stage `f`.

---

Also, for all stages that are scheduled with `compute_at`, mark all vectorizable inner dimensions as `gpu_threads`.

---

In the correctness tests at `test/autoschedulers/mullapudi2016/*.cpp` and the performance regression tests at `apps/*`, lower the estimated GPU shared-memory limit by specifying `autoscheduler.last_level_cache_size <= 10000`. Except for the `conv_layer` pipeline, all pipelines should observe an improvement in caching.
1 parent 990eb5f · commit e46ac2c
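To make the schedule shape concrete: below is a minimal hand-written sketch of the pattern the commit message describes, not the autoscheduler's actual output. The stage and variable names (f, g, xo, xi, ...) and the tile size 16 are illustrative assumptions. The consumer is GPU-tiled, and the producer is computed at the consumer's block variable, so the producer's tile is staged per block (in GPU shared memory) with its loops mapped to GPU threads.

#include "Halide.h"
using namespace Halide;

int main() {
    Var x("x"), y("y");
    Func f("f"), g("g");

    // Producer/consumer pair: g reads a small stencil of f.
    f(x, y) = cast<float>(x + y);
    g(x, y) = f(x, y) + f(x + 1, y) + f(x, y + 1);

    Var xo("xo"), yo("yo"), xi("xi"), yi("yi");
    // Consumer: map tiles to GPU blocks (xo, yo) and GPU threads (xi, yi).
    g.gpu_tile(x, y, xo, yo, xi, yi, 16, 16);

    // Producer: compute at the consumer's *block* variable xo, so the tile of f
    // needed by one block is staged in shared memory, and run its own loops as
    // gpu threads -- the shape this commit makes the autoscheduler emit.
    f.compute_at(g, xo)
        .gpu_threads(x, y);

    // Print the loop nest only; this does not require a GPU at build time.
    g.print_loop_nest();
    return 0;
}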

File tree

10 files changed: +49 -44 lines

apps/bgu/CMakeLists.txt

Lines changed: 1 addition & 10 deletions
@@ -19,16 +19,7 @@ add_halide_library(bgu FROM bgu.generator)
 add_halide_library(bgu_auto_schedule FROM bgu.generator
                    GENERATOR bgu
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   # Note(antonysigma): experimental GPU schedule failed on the Buildbot worker
-                   # "halide-testbranch-main-llvm18-x86-64-linux-cmake" with error:
-                   #
-                   # CUDA error: CUDA_ERROR_ILLEGAL_ADDRESS cuCtxSynchronize failed
-                   #
-                   # Curiously, it works on a low-end GPU: Nvidia GTX 1660S.
-                   #
-                   # Uncomment the following code to debug. PARAMS
-                   # autoscheduler.experimental_gpu_schedule=1
-                   )
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(bgu_filter filter.cpp)
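The PARAMS entries above are forwarded to the autoscheduler as plain key/value strings, so the same configuration can be tried without CMake. Below is a small sketch of the equivalent JIT-mode call; the stand-in pipeline, its estimates, and the plugin library name are illustrative assumptions, and only the two parameter keys and values come from the diff above.

#include "Halide.h"
using namespace Halide;

int main() {
    // Stand-in pipeline; the real apps build theirs inside generators.
    Var x("x"), y("y");
    Func in = lambda(x, y, cast<float>(x + y));
    Func blur("blur");
    blur(x, y) = (in(x, y) + in(x + 1, y) + in(x, y + 1)) / 3.0f;
    blur.set_estimates({{0, 1536}, {0, 2560}});

    // The Mullapudi2016 autoscheduler ships as a plugin; this is the usual
    // library name, but it depends on how Halide was built and installed.
    load_plugin("autoschedule_mullapudi2016");

    // Same knobs as the CMake PARAMS above, passed as key/value strings. With a
    // GPU target (e.g. HL_JIT_TARGET=host-cuda) the experimental GPU path applies.
    Target target = get_jit_target_from_environment();
    AutoschedulerParams params("Mullapudi2016",
                               {{"last_level_cache_size", "1000"},
                                {"experimental_gpu_schedule", "1"}});
    Pipeline(blur).apply_autoscheduler(target, params);
    return 0;
}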

apps/camera_pipe/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -20,7 +20,8 @@ add_halide_generator(camera_pipe.generator
 add_halide_library(camera_pipe FROM camera_pipe.generator)
 add_halide_library(camera_pipe_auto_schedule FROM camera_pipe.generator
                    GENERATOR camera_pipe
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS autoscheduler.last_level_cache_size=10000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(camera_pipe_process process.cpp)

apps/harris/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ add_halide_library(harris FROM harris.generator)
 add_halide_library(harris_auto_schedule FROM harris.generator
                    GENERATOR harris
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(harris_filter filter.cpp)

apps/iir_blur/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ add_halide_library(iir_blur FROM iir_blur.generator)
 add_halide_library(iir_blur_auto_schedule FROM iir_blur.generator
                    GENERATOR iir_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(iir_blur_filter filter.cpp)

apps/lens_blur/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ add_halide_library(lens_blur FROM lens_blur.generator)
 add_halide_library(lens_blur_auto_schedule FROM lens_blur.generator
                    GENERATOR lens_blur
                    AUTOSCHEDULER Halide::Mullapudi2016
-                   PARAMS autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.parallelism=4096 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(lens_blur_filter process.cpp)

apps/stencil_chain/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
@@ -18,7 +18,11 @@ add_halide_generator(stencil_chain.generator SOURCES stencil_chain_generator.cpp
 add_halide_library(stencil_chain FROM stencil_chain.generator)
 add_halide_library(stencil_chain_auto_schedule FROM stencil_chain.generator
                    GENERATOR stencil_chain
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   # When target=host-cuda or host-metal, limit the GPU shared
+                   # memory per block to avoid gpu kernel launch failure.
+                   PARAMS autoscheduler.last_level_cache_size=1000 autoscheduler.experimental_gpu_schedule=1
+                   )

 # Main executable
 add_executable(stencil_chain_process process.cpp)
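A rough sense of scale for these budgets (assuming, as on the CPU path, that last_level_cache_size is a byte budget for the footprint staged per block): with 4-byte float intermediates, a budget of 1000 bytes holds about 250 values, on the order of a 16x16 single-channel tile, while 20000 bytes holds about 5000 values, on the order of a 70x70 tile. The smaller budgets keep the per-block shared-memory allocation well under typical 48 KB hardware limits even when several producers are staged inside the same block.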

apps/unsharp/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -18,7 +18,8 @@ add_halide_generator(unsharp.generator SOURCES unsharp_generator.cpp)
 add_halide_library(unsharp FROM unsharp.generator)
 add_halide_library(unsharp_auto_schedule FROM unsharp.generator
                    GENERATOR unsharp
-                   AUTOSCHEDULER Halide::Mullapudi2016)
+                   AUTOSCHEDULER Halide::Mullapudi2016
+                   PARAMS autoscheduler.last_level_cache_size=20000 autoscheduler.experimental_gpu_schedule=1)

 # Main executable
 add_executable(unsharp_filter filter.cpp)

src/autoschedulers/mullapudi2016/AutoSchedule.cpp

Lines changed: 22 additions & 3 deletions
@@ -123,6 +123,21 @@ string get_sanitized_name(string name) {
     return name;
 }

+// Similar to std::replace, but assuming the vector contains unique values. And
+// if the element is absent, append new value to the end of vector.
+void replace_or_emplace(std::vector<VarOrRVar> &dims_, const VarOrRVar &before, VarOrRVar after) {
+    auto iter = std::find_if(dims_.begin(), dims_.end(),
+                             [before_name = before.name()](const VarOrRVar &d) {
+                                 return d.name() == before_name;
+                             });
+    const bool is_found = (iter != dims_.end());
+    if (is_found) {
+        *iter = std::move(after);
+    } else {
+        dims_.emplace_back(std::move(after));
+    }
+}
+
 // Representation of a function stage in the pipeline.
 struct FStage {
     Function func;
@@ -1426,8 +1441,11 @@ class GPUTilingDedup {
             threads_budget = simplify(max(threads_budget / new_entry.factor, 1));
         }

-        if (!is_already_split) {
-            helper.commit(sched, is_compute_at);
+        helper.commit(sched, is_compute_at);
+        if (is_compute_at) {
+            // There are dimensions that does not need splitting but marked as
+            // vectorizable. Mark them as gpu threads.
+            mark_gpu_threads(sched);
         }

         // After calling `gpu_tiles` from `GPUTileHelper::commit()`, a few of
@@ -3425,7 +3443,8 @@ void Partitioner::generate_group_cpu_schedule(
         if (parallelized_split) {
             auto split_vars = *parallelized_split;
             inner_dims.emplace_back(split_vars.inner);
-            outer_dims.emplace_back(split_vars.outer);
+
+            replace_or_emplace(outer_dims, v, split_vars.outer);
         }
     } else {
         f_handle.parallel(v);
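Since replace_or_emplace() is the core of the change, here is a self-contained sketch of its contract using std::string in place of Halide's VarOrRVar (the dimension names are made up). The point is that when an outer dimension is merely renamed by a split, outer_dims does not grow, which is what keeps the dims.size() >= outer_dims.size() assertion satisfied.

#include <algorithm>
#include <cassert>
#include <string>
#include <vector>

// Analogue of the new helper: replace the first (and assumed only) match in
// place, otherwise append the new value.
static void replace_or_emplace(std::vector<std::string> &dims, const std::string &before,
                               std::string after) {
    auto iter = std::find(dims.begin(), dims.end(), before);
    if (iter != dims.end()) {
        *iter = std::move(after);  // e.g. outer dim "y" becomes "y_o" after a split
    } else {
        dims.emplace_back(std::move(after));  // dim not tracked yet: append it
    }
}

int main() {
    std::vector<std::string> outer_dims = {"x", "y"};
    replace_or_emplace(outer_dims, "y", "y_o");  // "y" was split, so it is replaced
    replace_or_emplace(outer_dims, "c", "c_o");  // "c" was absent, so "c_o" is appended
    assert((outer_dims == std::vector<std::string>{"x", "y_o", "c_o"}));
    return 0;
}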

test/autoschedulers/mullapudi2016/large_window.cpp

Lines changed: 7 additions & 1 deletion
@@ -47,7 +47,13 @@ int main(int argc, char **argv) {
     Target target = get_jit_target_from_environment();
     Pipeline p(g);

-    p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+    constexpr Mullapudi2016TestParams gpu_specifications{
+        /* .last_level_cache_size = */ 35'000,
+        /* .parallelism = */ 128,
+    };
+
+    p.apply_autoscheduler(target,
+                          get_mullapudi2016_test_params(target.has_gpu_feature(), {gpu_specifications}));

     // Inspect the schedule (only for debugging))
     // g.print_loop_nest();

test/autoschedulers/mullapudi2016/reorder.cpp

Lines changed: 7 additions & 24 deletions
@@ -82,21 +82,11 @@ double run_test_2(bool auto_schedule) {
     // Provide estimates on the pipeline output
     diff.set_estimates({{0, left_im.width()}, {0, left_im.height()}, {0, 32}, {0, 3}});

-    // Auto-schedule the pipeline
-    //
-    // Increasing the GPU's active warp count estimate (aka parallelism)
-    // from 128 to 2048 to disable the Autoscheduler's grid-stride loop
-    // feature. At small parallelism value, the autoscheduler correctly
-    // designates dimension 'z' as the stride axis in the GPU grid-stride
-    // loop, which improves thread occupancy. However, it fails to reorder
-    // 'z' inside the gpu_blocks 'xo' and 'yo', which is required for proper
-    // loop nesting and successful code generation.
-    //
-    // Reference:
-    // https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
+    // note(antonysigma): Reducing the GPU's shared memory size estimate so that the GPU kernel
+    // can launch on consumer-grade GPUs.
     constexpr Mullapudi2016TestParams gpu_specifications{
-        /* .last_level_cache_size = */ 47'000,
-        /* .parallelism = */ 2048,
+        /* .last_level_cache_size = */ 20'000,
+        /* .parallelism = */ 128,
     };

     p.apply_autoscheduler(
@@ -139,16 +129,9 @@ double run_test_3(bool auto_schedule) {
     if (auto_schedule) {
         // Provide estimates on the pipeline output
         r.set_estimates({{0, 1024}, {0, 1024}, {0, 3}});
-        // Auto-schedule the pipeline
-        //
-        // Disabling this experimental GPU feature because the autoscheduler correctly
-        // identifies reduction domain 'r.x' as the stride axis for the GPU grid-stride loop,
-        // which helps retain threads efficiently. However, it fails to reorder 'r.x'
-        // inside the loop nests of gpu_blocks 'xo' and 'yo', which is necessary for
-        // successful code generation.
-        //
-        // Reference: https://developer.nvidia.com/blog/cuda-pro-tip-write-flexible-kernels-grid-stride-loops/
-        p.apply_autoscheduler(target, get_mullapudi2016_test_params(target.has_gpu_feature()));
+
+        p.apply_autoscheduler(target,
+                              get_mullapudi2016_test_params(target.has_gpu_feature()));
     } else {
         Var par("par");
         r.update(0).fuse(c, y, par).parallel(par).reorder(x, dom.x, dom.y).vectorize(x, 4);