Add Windows build jobs to CI

stotko · stotko · commit fa039de29996 · 2025-09-29T13:45:00.000+02:00
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
@@ -18,7 +18,7 @@ jobs:
       fail-fast: false
       matrix:
         os:
-          ["ubuntu-22.04"] # "windows-2025" # Disabled until solution/workaround for NVTX is present
+          ["ubuntu-22.04", "windows-2025"]
           # "ubuntu-24.04" # Postponed as long as testing against CUDA 12.1, needs 12.6+
         python: ["3.9", "3.10", "3.11", "3.12", "3.13"]
 
@@ -55,9 +55,9 @@ jobs:
 
       - name: Set up CUDA toolkit (Windows)
         if: runner.os == 'Windows'
-        uses: Jimver/cuda-toolkit@master
+        uses: Jimver/cuda-toolkit@v0.2.24 # https://github.com/Jimver/cuda-toolkit/issues/395
         with:
-          cuda: "12.4.0"
+          cuda: "12.8.1"
           method: "network"
 
       - name: Install torch with CUDA support (Ubuntu)
@@ -66,7 +66,7 @@ jobs:
 
       - name: Install torch with CUDA support (Windows)
         if: runner.os == 'Windows'
-        run: python -m pip install torch --index-url https://download.pytorch.org/whl/cu124
+        run: python -m pip install torch --index-url https://download.pytorch.org/whl/cu128
 
       - name: Install torchhull
         run: python -m pip install --editable ".[dev]"
diff --git a/src/torchhull/_C/CMakeLists.txt b/src/torchhull/_C/CMakeLists.txt
@@ -42,8 +42,8 @@ if(NOT TARGET stdgpu::stdgpu)
     FetchContent_Declare(
         stdgpu
         PREFIX stdgpu
-        URL https://github.com/stotko/stdgpu/archive/3a0b20e77a5eac672162fa5f6173ce9a34303d7f.tar.gz
-        URL_HASH SHA256=4723bba67ccb67f3a0218515f555c4ed385ae2f638cf668b81d6d490c1f47fbc
+        URL https://github.com/stotko/stdgpu/archive/abc7d0523c9921227c90bdadbb24d4a17e35de61.tar.gz
+        URL_HASH SHA256=35aaf97a9d63817464c83020735e8761ad2ac64bef5c45e5e6b90601619b4fb5
         DOWNLOAD_DIR "${CMAKE_BINARY_DIR}/external/stdgpu"
         SYSTEM
     )
@@ -57,6 +57,9 @@ if(NOT TARGET stdgpu::stdgpu)
     set(STDGPU_BUILD_TESTS OFF CACHE INTERNAL "")
 
     FetchContent_MakeAvailable(stdgpu)
+
+    find_package(CUDAToolkit REQUIRED)
+    target_link_libraries(stdgpu PUBLIC CUDA::cudart_static)
 endif()
 
 
@@ -78,7 +81,7 @@ if(charonload_FOUND)
     target_compile_definitions(torchhull_cpp PRIVATE "__CUDA_NO_HALF_OPERATORS__")
     target_compile_features(torchhull_cpp PUBLIC cxx_std_17)
     target_compile_options(torchhull_cpp PRIVATE ${HOST_DEVICE_FLAGS})
-    target_link_libraries(torchhull_cpp PRIVATE glm::glm stdgpu::stdgpu)
+    target_link_libraries(torchhull_cpp PRIVATE glm::glm-header-only stdgpu::stdgpu)
 
 
     if(TORCHHULL_BUILD_BINDINGS)
diff --git a/src/torchhull/_C/src/gaussian_blur_cuda.cu b/src/torchhull/_C/src/gaussian_blur_cuda.cu
@@ -447,6 +447,20 @@ gaussian_blur_cuda_sparse(const torch::Tensor& images,
     dim3 grid_convolution;
     at::cuda::getApplyGrid(M, grid_convolution, images.device().index(), threads_per_block);
 
+#define CASE_TILE_CONVOLUTION_KERNEL_SPECIALIZED(KERNEL_SIZE)                                                          \
+    case KERNEL_SIZE:                                                                                                  \
+    {                                                                                                                  \
+        tile_convolution_kernel_specialized<KERNEL_SIZE><<<grid_convolution, threads, 0, stream>>>(tile_indices_,      \
+                                                                                                   M,                  \
+                                                                                                   tile_size,          \
+                                                                                                   sigma,              \
+                                                                                                   images_,            \
+                                                                                                   blurred_images_);   \
+        DEFER(AT_CUDA_CHECK(cudaGetLastError());)                                                                      \
+        DEFER(AT_CUDA_CHECK(cudaStreamSynchronize(stream));)                                                           \
+    }                                                                                                                  \
+    break;
+
     AT_DISPATCH_ALL_TYPES_AND(
             torch::ScalarType::Half,
             images.scalar_type(),
@@ -464,20 +478,6 @@ gaussian_blur_cuda_sparse(const torch::Tensor& images,
                             auto blurred_images_ =
                                     blurred_images.packed_accessor64<scalar_t, 4, torch::RestrictPtrTraits>();
 
-#define CASE_TILE_CONVOLUTION_KERNEL_SPECIALIZED(KERNEL_SIZE)                                                          \
-    case KERNEL_SIZE:                                                                                                  \
-    {                                                                                                                  \
-        tile_convolution_kernel_specialized<KERNEL_SIZE><<<grid_convolution, threads, 0, stream>>>(tile_indices_,      \
-                                                                                                   M,                  \
-                                                                                                   tile_size,          \
-                                                                                                   sigma,              \
-                                                                                                   images_,            \
-                                                                                                   blurred_images_);   \
-        DEFER(AT_CUDA_CHECK(cudaGetLastError());)                                                                      \
-        DEFER(AT_CUDA_CHECK(cudaStreamSynchronize(stream));)                                                           \
-    }                                                                                                                  \
-    break;
-
                             switch (kernel_size)
                             {
                                 // Tested all possible values up to 21 for specialization.