29 changes: 29 additions & 0 deletions dflash/CMakeLists.txt
@@ -44,7 +44,36 @@ endif()
# the spec_prefill demo (target_gen path uses standard quant pairs).
option(DFLASH27B_FA_ALL_QUANTS "Compile ggml-cuda fattn kernels for all KV-quant pairs" ON)
set(GGML_CUDA_FA_ALL_QUANTS ${DFLASH27B_FA_ALL_QUANTS} CACHE BOOL "" FORCE)

# Use only the ggml subtree of llama.cpp (skip libllama).

# ─── CUDA architecture auto-detection ──────────────────────────────
# CUDA 12.8+ resolves "native" on Blackwell to sm_120a (DGX Spark variant).
# Query nvidia-smi at configure time to get the exact SM and set
# CMAKE_CUDA_ARCHITECTURES explicitly so ggml-cuda compiles for the
# detected physical GPU.
if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
  find_program(_DFLASH_NVIDIA_SMI nvidia-smi)
  if(_DFLASH_NVIDIA_SMI)
    execute_process(
      COMMAND "${_DFLASH_NVIDIA_SMI}"
              --query-gpu=compute_cap --format=csv,noheader
      OUTPUT_VARIABLE _dflash_gpu_caps
      ERROR_QUIET
      OUTPUT_STRIP_TRAILING_WHITESPACE
      RESULT_VARIABLE _dflash_smi_rc
    )
    if(_dflash_smi_rc EQUAL 0 AND _dflash_gpu_caps)
      string(REGEX MATCH "^[0-9]+\\.[0-9]+" _dflash_cap "${_dflash_gpu_caps}")
      string(REPLACE "." "" _dflash_arch "${_dflash_cap}")
      if(_dflash_arch)
        set(CMAKE_CUDA_ARCHITECTURES "${_dflash_arch}" CACHE STRING
            "CUDA architectures (auto-detected from nvidia-smi: ${_dflash_cap})" FORCE)
        message(STATUS "dflash27b: GPU compute_cap ${_dflash_cap} → CUDA_ARCHITECTURES=${_dflash_arch}")
      endif()
    endif()
  endif()
endif()
add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)

# The C++ sources include <cuda_runtime.h> directly, so the toolkit headers must
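As a quick sanity check, the probe can be exercised outside the full build with `cmake -P`. The snippet below is a hypothetical standalone helper (check_arch.cmake, not part of this PR) that mirrors the detection logic above and prints the value it would pick; it uses the same nvidia-smi flags and CMake commands as the added block in dflash/CMakeLists.txt.

# check_arch.cmake: hypothetical helper, not part of this PR.
# Run with `cmake -P check_arch.cmake` on the target machine to preview the
# value the auto-detection in dflash/CMakeLists.txt would force into the cache.
find_program(_smi nvidia-smi)
if(_smi)
  execute_process(
    COMMAND "${_smi}" --query-gpu=compute_cap --format=csv,noheader
    OUTPUT_VARIABLE _caps
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE
    RESULT_VARIABLE _rc
  )
  if(_rc EQUAL 0 AND _caps)
    # nvidia-smi prints one line per GPU, e.g. "12.0"; the regex keeps the first.
    string(REGEX MATCH "^[0-9]+\\.[0-9]+" _cap "${_caps}")
    string(REPLACE "." "" _arch "${_cap}")   # "12.0" -> "120"
    message(STATUS "would set CMAKE_CUDA_ARCHITECTURES=${_arch}")
  else()
    message(STATUS "nvidia-smi probe failed; detection would be skipped")
  endif()
else()
  message(STATUS "nvidia-smi not found; detection would be skipped")
endif()

Configuring with an explicit -DCMAKE_CUDA_ARCHITECTURES=<sm> still bypasses the probe entirely, since the guard in the PR only fires when the variable is empty or set to "native".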