29 changes: 29 additions & 0 deletions dflash/CMakeLists.txt
@@ -44,7 +44,36 @@ endif()
# the spec_prefill demo (target_gen path uses standard quant pairs).
option(DFLASH27B_FA_ALL_QUANTS "Compile ggml-cuda fattn kernels for all KV-quant pairs" ON)
set(GGML_CUDA_FA_ALL_QUANTS ${DFLASH27B_FA_ALL_QUANTS} CACHE BOOL "" FORCE)

# Use only the ggml subtree of llama.cpp (skip libllama).

# ─── CUDA architecture auto-detection ──────────────────────────────
# CUDA 12.8+ resolves "native" on Blackwell to sm_120a (DGX Spark variant).
# Query nvidia-smi at configure time to get the exact SM and set
# CMAKE_CUDA_ARCHITECTURES explicitly so ggml-cuda compiles for the
# detected physical GPU.
if(NOT CMAKE_CUDA_ARCHITECTURES OR CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
  find_program(_DFLASH_NVIDIA_SMI nvidia-smi)
  if(_DFLASH_NVIDIA_SMI)
    execute_process(
      COMMAND "${_DFLASH_NVIDIA_SMI}"
              --query-gpu=compute_cap --format=csv,noheader
      OUTPUT_VARIABLE _dflash_gpu_caps
      ERROR_QUIET
      OUTPUT_STRIP_TRAILING_WHITESPACE
      RESULT_VARIABLE _dflash_smi_rc
    )
    if(_dflash_smi_rc EQUAL 0 AND _dflash_gpu_caps)
      string(REGEX MATCH "^[0-9]+\\.[0-9]+" _dflash_cap "${_dflash_gpu_caps}")
      string(REPLACE "." "" _dflash_arch "${_dflash_cap}")
      if(_dflash_arch)
        set(CMAKE_CUDA_ARCHITECTURES "${_dflash_arch}" CACHE STRING
            "CUDA architectures (auto-detected from nvidia-smi: ${_dflash_cap})" FORCE)
        message(STATUS "dflash27b: GPU compute_cap ${_dflash_cap} → CUDA_ARCHITECTURES=${_dflash_arch}")
      endif()
    endif()
  endif()
endif()
add_subdirectory(deps/llama.cpp/ggml EXCLUDE_FROM_ALL)

# The C++ sources include <cuda_runtime.h> directly, so the toolkit headers must
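As a quick sanity check, the probe can be exercised outside the full build with `cmake -P`. The snippet below is a hypothetical standalone helper (check_arch.cmake, not part of this PR) that mirrors the detection logic above and prints the value it would pick; it uses the same nvidia-smi flags and CMake commands as the added block in dflash/CMakeLists.txt.

# check_arch.cmake: hypothetical helper, not part of this PR.
# Run with `cmake -P check_arch.cmake` on the target machine to preview the
# value the auto-detection in dflash/CMakeLists.txt would force into the cache.
find_program(_smi nvidia-smi)
if(_smi)
  execute_process(
    COMMAND "${_smi}" --query-gpu=compute_cap --format=csv,noheader
    OUTPUT_VARIABLE _caps
    ERROR_QUIET
    OUTPUT_STRIP_TRAILING_WHITESPACE
    RESULT_VARIABLE _rc
  )
  if(_rc EQUAL 0 AND _caps)
    # nvidia-smi prints one line per GPU, e.g. "12.0"; the regex keeps the first.
    string(REGEX MATCH "^[0-9]+\\.[0-9]+" _cap "${_caps}")
    string(REPLACE "." "" _arch "${_cap}")   # "12.0" -> "120"
    message(STATUS "would set CMAKE_CUDA_ARCHITECTURES=${_arch}")
  else()
    message(STATUS "nvidia-smi probe failed; detection would be skipped")
  endif()
else()
  message(STATUS "nvidia-smi not found; detection would be skipped")
endif()

Configuring with an explicit -DCMAKE_CUDA_ARCHITECTURES=<sm> still bypasses the probe entirely, since the guard in the PR only fires when the variable is empty or set to "native".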