Skip to content

sampling: Port of Smooth Sampling / Quadratic Sampling support #13441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 24 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
2669c15
Initial commit
Silver267 May 10, 2025
402ea4f
Merge branch 'ggml-org:master' into master
Silver267 May 10, 2025
fecd48a
Decouple smoothing from temp_ext
Silver267 May 11, 2025
0d7053b
Revert changes to the implementation of temp_ext
Silver267 May 11, 2025
73b8583
Merge branch 'ggml-org:master' into master
Silver267 May 11, 2025
ab85a84
Added docs
Silver267 May 11, 2025
9f7c0d2
Webui stuff
Silver267 May 11, 2025
77c1d3a
Merge branch 'ggml-org:master' into master
Silver267 May 11, 2025
60a537f
Merge branch 'ggml-org:master' into master
Silver267 May 13, 2025
e2b9c0b
vulkan: workaround FA compile failures on macos (#13517)
jeffbolznv May 14, 2025
b3cade6
scripts : fix compare-llama-bench.py show parameter (#13514)
CISC May 14, 2025
5cf54f3
docs: Update link to ggml-org in multimodal.md (#13513)
ddpasa May 14, 2025
453aedf
webui: Allow pasting file from clipboard (#13526)
luca020400 May 14, 2025
a0df391
webui : use fflate for more deterministic gzip compress (#13525)
ngxson May 14, 2025
f30f486
vulkan: KHR_coopmat flash attention (#13506)
jeffbolznv May 14, 2025
1fa821d
cmake: simplify vulkan shader test logic (#13263)
bandoti May 14, 2025
3be533d
server : fix cache_tokens bug with no cache_prompt (#13533)
ngxson May 14, 2025
5c0677a
server : passthrough the /models endpoint during loading (#13535)
ggerganov May 14, 2025
f7d9eb2
fix: Move build_inp_pos to the top of the graph section for build_gra…
gabe-l-hart May 14, 2025
8553934
CUDA: faster Deepseek FA, add Turing support (#13435)
JohannesGaessler May 14, 2025
fbbf620
llama : fix quantize with dl backends (#13539)
slaren May 14, 2025
60526e3
CUDA: fix crash on large batch size for quant. MoE (#13537)
JohannesGaessler May 14, 2025
a65d613
fix: crash when calling `llama_state_get_size` on a context without a…
giladgd May 14, 2025
95f0e96
editorconfig : fix trailing whitespace from #13542 (#13546)
CISC May 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1760,6 +1760,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.xtc_threshold = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--smoothing-factor"}, "N",
string_format("smoothing factor (default: %.1f, 0.0 = disabled)", (double)params.sampling.smoothing_factor),
[](common_params & params, const std::string & value) {
params.sampling.smoothing_factor = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--smoothing-curve"}, "N",
string_format("smoothing curve (default: %.1f, 1.0 = disabled)", (double)params.sampling.smoothing_curve),
[](common_params & params, const std::string & value) {
params.sampling.smoothing_curve = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--typical"}, "N",
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
Expand Down
4 changes: 4 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
COMMON_SAMPLER_TYPE_SMOOTHING = 12,
};

// dimensionality reduction methods, used by cvector-generator
Expand Down Expand Up @@ -135,6 +136,8 @@ struct common_params_sampling {
float min_p = 0.05f; // 0.0 = disabled
float xtc_probability = 0.00f; // 0.0 = disabled
float xtc_threshold = 0.10f; // > 0.5 disables XTC
float smoothing_factor = 0.0f; // controls the quadratic adjustment in smooth / quadratic sampling (0.0 = disabled)
float smoothing_curve = 1.0f; // controls the cubic transformation curve for smoothing / quadratic sampling.
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
Expand Down Expand Up @@ -167,6 +170,7 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TOP_P,
COMMON_SAMPLER_TYPE_MIN_P,
COMMON_SAMPLER_TYPE_XTC,
COMMON_SAMPLER_TYPE_SMOOTHING,
COMMON_SAMPLER_TYPE_TEMPERATURE,
};

Expand Down
12 changes: 10 additions & 2 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,11 @@ std::string common_params_sampling::print() const {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, smoothing_factor = %.3f, smoothing_curve = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
top_k, top_p, min_p, xtc_probability, xtc_threshold, smoothing_factor, smoothing_curve, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau);

return std::string(result);
Expand Down Expand Up @@ -258,6 +258,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_SMOOTHING:
llama_sampler_chain_add(result->chain, llama_sampler_init_smoothing (params.smoothing_factor, params.smoothing_curve));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
Expand Down Expand Up @@ -479,6 +482,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
case COMMON_SAMPLER_TYPE_SMOOTHING: return 'q';
default : return '?';
}
}
Expand All @@ -495,6 +499,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
case COMMON_SAMPLER_TYPE_SMOOTHING: return "smoothing";
default : return "";
}
}
Expand All @@ -509,6 +514,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "smoothing", COMMON_SAMPLER_TYPE_SMOOTHING},
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
};
Expand All @@ -525,6 +531,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "quadratic", COMMON_SAMPLER_TYPE_SMOOTHING},
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
};

Expand Down Expand Up @@ -560,6 +567,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_SMOOTHING), COMMON_SAMPLER_TYPE_SMOOTHING},
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
};
Expand Down
2 changes: 1 addition & 1 deletion docs/multimodal.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload

## Pre-quantized models

These are ready-to-use models, most of them come with `Q4_K_M` quantization by default.
These are ready-to-use models; most of them come with `Q4_K_M` quantization by default. They can be found on the ggml-org Hugging Face page: https://huggingface.co/ggml-org

Replace `(tool_name)` with the name of the binary you want to use. For example, `llama-mtmd-cli` or `llama-server`

Expand Down
18 changes: 13 additions & 5 deletions ggml/src/ggml-cuda/fattn-common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -678,17 +678,25 @@ void launch_fattn(
) {
constexpr int ncols = ncols1 * ncols2;

const bool is_mla = DV == 512; // TODO better parameterization

const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];

GGML_ASSERT(V || is_mla);

const ggml_tensor * mask = dst->src[3];

ggml_tensor * KQV = dst;

GGML_ASSERT(Q->type == GGML_TYPE_F32);
GGML_ASSERT(KQV->type == GGML_TYPE_F32);

GGML_ASSERT( Q->nb[0] == ggml_element_size(Q));
GGML_ASSERT( K->nb[0] == ggml_element_size(K));
GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));

GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
Expand All @@ -713,10 +721,10 @@ void launch_fattn(
size_t nb12 = K->nb[2];
size_t nb13 = K->nb[3];

const char * V_data = (const char *) V->data;
size_t nb21 = V->nb[1];
size_t nb22 = V->nb[2];
size_t nb23 = V->nb[3];
const char * V_data = V ? (const char *) V->data : nullptr;
size_t nb21 = V ? V->nb[1] : nb11;
size_t nb22 = V ? V->nb[2] : nb12;
size_t nb23 = V ? V->nb[3] : nb13;

if (need_f16_K && K->type != GGML_TYPE_F16) {
GGML_ASSERT(ggml_is_contiguously_allocated(K));
Expand All @@ -733,7 +741,7 @@ void launch_fattn(
nb13 = nb13*bs*sizeof(half)/ts;
}

if (need_f16_V && V->type != GGML_TYPE_F16) {
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
GGML_ASSERT(ggml_is_contiguously_allocated(V));
V_f16.alloc(ggml_nelements(V));
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
Expand Down
Loading