diff --git a/dflash/CMakeLists.txt b/dflash/CMakeLists.txt
index c5ea80dd..ffae33e9 100644
--- a/dflash/CMakeLists.txt
+++ b/dflash/CMakeLists.txt
@@ -44,7 +44,36 @@ endif()
 # the spec_prefill demo (target_gen path uses standard quant pairs).
 option(DFLASH27B_FA_ALL_QUANTS "Compile ggml-cuda fattn kernels for all KV-quant pairs" ON)
 set(GGML_CUDA_FA_ALL_QUANTS ${DFLASH27B_FA_ALL_QUANTS} CACHE BOOL "" FORCE)
+
 # Use only the ggml subtree of llama.cpp (skip libllama).
+
+# ─── CUDA architecture auto-detection ──────────────────────────────
+# CUDA 12.8+ resolves "native" on Blackwell to sm_120a (DGX Spark variant).
+# Query nvidia-smi at configure time to get the exact SM and set
+# CMAKE_CUDA_ARCHITECTURES explicitly so ggml-cuda compiles for the
+# detected physical GPU.
+if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
+    find_program(_DFLASH_NVIDIA_SMI nvidia-smi)
+    if(_DFLASH_NVIDIA_SMI)
+        execute_process(
+            COMMAND "${_DFLASH_NVIDIA_SMI}"
+                    --query-gpu=compute_cap --format=csv,noheader
+            OUTPUT_VARIABLE _dflash_gpu_caps
+            ERROR_QUIET
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+            RESULT_VARIABLE _dflash_smi_rc
+        )
+        if(_dflash_smi_rc EQUAL 0 AND _dflash_gpu_caps)
+            string(REGEX MATCH "^[0-9]+\\.[0-9]+" _dflash_cap "${_dflash_gpu_caps}")
+            string(REPLACE "." "" _dflash_arch "${_dflash_cap}")
+            if(_dflash_arch)
+                set(CMAKE_CUDA_ARCHITECTURES "${_dflash_arch}" CACHE STRING
+                    "CUDA architectures (auto-detected from nvidia-smi: ${_dflash_cap})" FORCE)
+                message(STATUS "dflash27b: GPU compute_cap ${_dflash_cap} → CUDA_ARCHITECTURES=${_dflash_arch}")
+            endif()
+        endif()
+    endif()
+endif()
 add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)
 
 # The C++ sources include directly, so the toolkit headers must