Add Windows build jobs to CI

stotko · stotko · commit 49bcd9cee573 · 2025-09-29T11:11:28.000+02:00
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -18,7 +18,7 @@ jobs:
       fail-fast: false
       matrix:
         os:
-          ["ubuntu-22.04"] # "windows-2025" # Disabled until solution/workaround for NVTX is present
+          ["ubuntu-22.04", "windows-2025"]
           # "ubuntu-24.04" # Postponed as long as testing against CUDA 12.1, needs 12.6+
         python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
@@ -55,9 +55,9 @@ jobs:
 
       - name: Set up CUDA toolkit (Windows)
         if: runner.os == 'Windows'
-        uses: Jimver/cuda-toolkit@master
+        uses: Jimver/cuda-toolkit@v0.2.24  # Pinned; see https://github.com/Jimver/cuda-toolkit/issues/395
         with:
-          cuda: "12.4.0"
+          cuda: "12.8.1"
           method: "network"
 
       - name: Install torch with CUDA support (Ubuntu)
@@ -66,7 +66,7 @@ jobs:
 
       - name: Install torch with CUDA support (Windows)
         if: runner.os == 'Windows'
-        run: python -m pip install torch --index-url https://download.pytorch.org/whl/cu124
+        run: python -m pip install torch --index-url https://download.pytorch.org/whl/cu128
 
       - name: Install torchhull
         run: python -m pip install --editable ".[dev]"
diff --git a/src/torchhull/_C/src/gaussian_blur_cuda.cu b/src/torchhull/_C/src/gaussian_blur_cuda.cu
@@ -447,6 +447,20 @@ gaussian_blur_cuda_sparse(const torch::Tensor& images,
     dim3 grid_convolution;
     at::cuda::getApplyGrid(M, grid_convolution, images.device().index(), threads_per_block);
 
+#define CASE_TILE_CONVOLUTION_KERNEL_SPECIALIZED(KERNEL_SIZE)                                                          \
+    case KERNEL_SIZE:                                                                                                  \
+    {                                                                                                                  \
+        tile_convolution_kernel_specialized<KERNEL_SIZE><<<grid_convolution, threads, 0, stream>>>(tile_indices_,      \
+                                                                                                   M,                  \
+                                                                                                   tile_size,          \
+                                                                                                   sigma,              \
+                                                                                                   images_,            \
+                                                                                                   blurred_images_);   \
+        DEFER(AT_CUDA_CHECK(cudaGetLastError());)                                                                      \
+        DEFER(AT_CUDA_CHECK(cudaStreamSynchronize(stream));)                                                           \
+    }                                                                                                                  \
+    break;
+
     AT_DISPATCH_ALL_TYPES_AND(
             torch::ScalarType::Half,
             images.scalar_type(),
@@ -464,20 +478,6 @@ gaussian_blur_cuda_sparse(const torch::Tensor& images,
                             auto blurred_images_ =
                                     blurred_images.packed_accessor64<scalar_t, 4, torch::RestrictPtrTraits>();
 
-#define CASE_TILE_CONVOLUTION_KERNEL_SPECIALIZED(KERNEL_SIZE)                                                          \
-    case KERNEL_SIZE:                                                                                                  \
-    {                                                                                                                  \
-        tile_convolution_kernel_specialized<KERNEL_SIZE><<<grid_convolution, threads, 0, stream>>>(tile_indices_,      \
-                                                                                                   M,                  \
-                                                                                                   tile_size,          \
-                                                                                                   sigma,              \
-                                                                                                   images_,            \
-                                                                                                   blurred_images_);   \
-        DEFER(AT_CUDA_CHECK(cudaGetLastError());)                                                                      \
-        DEFER(AT_CUDA_CHECK(cudaStreamSynchronize(stream));)                                                           \
-    }                                                                                                                  \
-    break;
-
                             switch (kernel_size)
                             {
                                 // Tested all possible values up to 21 for specialization.