Skip to content

sampling: Port of Smooth Sampling / Quadratic Sampling support #13441

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 24 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
2669c15
Initial commit
Silver267 May 10, 2025
402ea4f
Merge branch 'ggml-org:master' into master
Silver267 May 10, 2025
fecd48a
Decouple smoothing from temp_ext
Silver267 May 11, 2025
0d7053b
Revert changes to the implementation of temp_ext
Silver267 May 11, 2025
73b8583
Merge branch 'ggml-org:master' into master
Silver267 May 11, 2025
ab85a84
Added docs
Silver267 May 11, 2025
9f7c0d2
Webui stuff
Silver267 May 11, 2025
77c1d3a
Merge branch 'ggml-org:master' into master
Silver267 May 11, 2025
60a537f
Merge branch 'ggml-org:master' into master
Silver267 May 13, 2025
e2b9c0b
vulkan: workaround FA compile failures on macos (#13517)
jeffbolznv May 14, 2025
b3cade6
scripts : fix compare-llama-bench.py show parameter (#13514)
CISC May 14, 2025
5cf54f3
docs: Update link to ggml-org in multimodal.md (#13513)
ddpasa May 14, 2025
453aedf
webui: Allow pasting file from clipboard (#13526)
luca020400 May 14, 2025
a0df391
webui : use fflate for more deterministic gzip compress (#13525)
ngxson May 14, 2025
f30f486
vulkan: KHR_coopmat flash attention (#13506)
jeffbolznv May 14, 2025
1fa821d
cmake: simplify vulkan shader test logic (#13263)
bandoti May 14, 2025
3be533d
server : fix cache_tokens bug with no cache_prompt (#13533)
ngxson May 14, 2025
5c0677a
server : passthrough the /models endpoint during loading (#13535)
ggerganov May 14, 2025
f7d9eb2
fix: Move build_inp_pos to the top of the graph section for build_gra…
gabe-l-hart May 14, 2025
8553934
CUDA: faster Deepseek FA, add Turing support (#13435)
JohannesGaessler May 14, 2025
fbbf620
llama : fix quantize with dl backends (#13539)
slaren May 14, 2025
60526e3
CUDA: fix crash on large batch size for quant. MoE (#13537)
JohannesGaessler May 14, 2025
a65d613
fix: crash when calling `llama_state_get_size` on a context without a…
giladgd May 14, 2025
95f0e96
editorconfig : fix trailing whitespace from #13542 (#13546)
CISC May 14, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1760,6 +1760,20 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.sampling.xtc_threshold = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--smoothing-factor"}, "N",
string_format("smoothing factor (default: %.1f, 0.0 = disabled)", (double)params.sampling.smoothing_factor),
[](common_params & params, const std::string & value) {
params.sampling.smoothing_factor = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--smoothing-curve"}, "N",
string_format("smoothing curve (default: %.1f, 1.0 = disabled)", (double)params.sampling.smoothing_curve),
[](common_params & params, const std::string & value) {
params.sampling.smoothing_curve = std::stof(value);
}
).set_sparam());
add_opt(common_arg(
{"--typical"}, "N",
string_format("locally typical sampling, parameter p (default: %.1f, 1.0 = disabled)", (double)params.sampling.typ_p),
Expand Down
4 changes: 4 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@ enum common_sampler_type {
COMMON_SAMPLER_TYPE_INFILL = 9,
COMMON_SAMPLER_TYPE_PENALTIES = 10,
COMMON_SAMPLER_TYPE_TOP_N_SIGMA = 11,
COMMON_SAMPLER_TYPE_SMOOTHING = 12,
};

// dimensionality reduction methods, used by cvector-generator
Expand Down Expand Up @@ -135,6 +136,8 @@ struct common_params_sampling {
float min_p = 0.05f; // 0.0 = disabled
float xtc_probability = 0.00f; // 0.0 = disabled
float xtc_threshold = 0.10f; // > 0.5 disables XTC
float smoothing_factor = 0.0f; // controls the quadratic adjustment in smooth / quadratic sampling (0.0 = disabled)
float smoothing_curve = 1.0f; // controls the cubic transformation curve for smoothing / quadratic sampling.
float typ_p = 1.00f; // typical_p, 1.0 = disabled
float temp = 0.80f; // <= 0.0 to sample greedily, 0.0 to not output probabilities
float dynatemp_range = 0.00f; // 0.0 = disabled
Expand Down Expand Up @@ -167,6 +170,7 @@ struct common_params_sampling {
COMMON_SAMPLER_TYPE_TOP_P,
COMMON_SAMPLER_TYPE_MIN_P,
COMMON_SAMPLER_TYPE_XTC,
COMMON_SAMPLER_TYPE_SMOOTHING,
COMMON_SAMPLER_TYPE_TEMPERATURE,
};

Expand Down
12 changes: 10 additions & 2 deletions common/sampling.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,11 +136,11 @@ std::string common_params_sampling::print() const {
snprintf(result, sizeof(result),
"\trepeat_last_n = %d, repeat_penalty = %.3f, frequency_penalty = %.3f, presence_penalty = %.3f\n"
"\tdry_multiplier = %.3f, dry_base = %.3f, dry_allowed_length = %d, dry_penalty_last_n = %d\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\ttop_k = %d, top_p = %.3f, min_p = %.3f, xtc_probability = %.3f, xtc_threshold = %.3f, smoothing_factor = %.3f, smoothing_curve = %.3f, typical_p = %.3f, top_n_sigma = %.3f, temp = %.3f\n"
"\tmirostat = %d, mirostat_lr = %.3f, mirostat_ent = %.3f",
penalty_last_n, penalty_repeat, penalty_freq, penalty_present,
dry_multiplier, dry_base, dry_allowed_length, dry_penalty_last_n,
top_k, top_p, min_p, xtc_probability, xtc_threshold, typ_p, top_n_sigma, temp,
top_k, top_p, min_p, xtc_probability, xtc_threshold, smoothing_factor, smoothing_curve, typ_p, top_n_sigma, temp,
mirostat, mirostat_eta, mirostat_tau);

return std::string(result);
Expand Down Expand Up @@ -258,6 +258,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
case COMMON_SAMPLER_TYPE_XTC:
llama_sampler_chain_add(result->chain, llama_sampler_init_xtc (params.xtc_probability, params.xtc_threshold, params.min_keep, params.seed));
break;
case COMMON_SAMPLER_TYPE_SMOOTHING:
llama_sampler_chain_add(result->chain, llama_sampler_init_smoothing (params.smoothing_factor, params.smoothing_curve));
break;
case COMMON_SAMPLER_TYPE_TYPICAL_P:
llama_sampler_chain_add(result->chain, llama_sampler_init_typical (params.typ_p, params.min_keep));
break;
Expand Down Expand Up @@ -479,6 +482,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return 'x';
case COMMON_SAMPLER_TYPE_INFILL: return 'i';
case COMMON_SAMPLER_TYPE_PENALTIES: return 'e';
case COMMON_SAMPLER_TYPE_SMOOTHING: return 'q';
default : return '?';
}
}
Expand All @@ -495,6 +499,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
case COMMON_SAMPLER_TYPE_XTC: return "xtc";
case COMMON_SAMPLER_TYPE_INFILL: return "infill";
case COMMON_SAMPLER_TYPE_PENALTIES: return "penalties";
case COMMON_SAMPLER_TYPE_SMOOTHING: return "smoothing";
default : return "";
}
}
Expand All @@ -509,6 +514,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "min_p", COMMON_SAMPLER_TYPE_MIN_P },
{ "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
{ "xtc", COMMON_SAMPLER_TYPE_XTC },
{ "smoothing", COMMON_SAMPLER_TYPE_SMOOTHING},
{ "infill", COMMON_SAMPLER_TYPE_INFILL },
{ "penalties", COMMON_SAMPLER_TYPE_PENALTIES },
};
Expand All @@ -525,6 +531,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
{ "typ-p", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "typ", COMMON_SAMPLER_TYPE_TYPICAL_P },
{ "min-p", COMMON_SAMPLER_TYPE_MIN_P },
{ "quadratic", COMMON_SAMPLER_TYPE_SMOOTHING},
{ "temp", COMMON_SAMPLER_TYPE_TEMPERATURE },
};

Expand Down Expand Up @@ -560,6 +567,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_MIN_P), COMMON_SAMPLER_TYPE_MIN_P },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC), COMMON_SAMPLER_TYPE_XTC },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_SMOOTHING), COMMON_SAMPLER_TYPE_SMOOTHING},
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL), COMMON_SAMPLER_TYPE_INFILL },
{ common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES), COMMON_SAMPLER_TYPE_PENALTIES },
};
Expand Down
2 changes: 1 addition & 1 deletion docs/multimodal.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ llama-server -hf ggml-org/gemma-3-4b-it-GGUF --no-mmproj-offload

## Pre-quantized models

These are ready-to-use models, most of them come with `Q4_K_M` quantization by default.
These are ready-to-use models; most of them come with `Q4_K_M` quantization by default. They can be found on the ggml-org Hugging Face page: https://huggingface.co/ggml-org

Replace `(tool_name)` with the name of the binary you want to use. For example, `llama-mtmd-cli` or `llama-server`

Expand Down
18 changes: 13 additions & 5 deletions ggml/src/ggml-cuda/fattn-common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -678,17 +678,25 @@ void launch_fattn(
) {
constexpr int ncols = ncols1 * ncols2;

const bool is_mla = DV == 512; // TODO better parameterization

const ggml_tensor * Q = dst->src[0];
const ggml_tensor * K = dst->src[1];
const ggml_tensor * V = dst->src[2];

GGML_ASSERT(V || is_mla);

const ggml_tensor * mask = dst->src[3];

ggml_tensor * KQV = dst;

GGML_ASSERT(Q->type == GGML_TYPE_F32);
GGML_ASSERT(KQV->type == GGML_TYPE_F32);

GGML_ASSERT( Q->nb[0] == ggml_element_size(Q));
GGML_ASSERT( K->nb[0] == ggml_element_size(K));
GGML_ASSERT(!V || V->nb[0] == ggml_element_size(V));

GGML_ASSERT(!mask || mask->type == GGML_TYPE_F16);
GGML_ASSERT(!mask || mask->ne[1] >= GGML_PAD(Q->ne[1], 16) &&
"the Flash-Attention CUDA kernel requires the mask to be padded to 16 and at least n_queries big");
Expand All @@ -713,10 +721,10 @@ void launch_fattn(
size_t nb12 = K->nb[2];
size_t nb13 = K->nb[3];

const char * V_data = (const char *) V->data;
size_t nb21 = V->nb[1];
size_t nb22 = V->nb[2];
size_t nb23 = V->nb[3];
const char * V_data = V ? (const char *) V->data : nullptr;
size_t nb21 = V ? V->nb[1] : nb11;
size_t nb22 = V ? V->nb[2] : nb12;
size_t nb23 = V ? V->nb[3] : nb13;

if (need_f16_K && K->type != GGML_TYPE_F16) {
GGML_ASSERT(ggml_is_contiguously_allocated(K));
Expand All @@ -733,7 +741,7 @@ void launch_fattn(
nb13 = nb13*bs*sizeof(half)/ts;
}

if (need_f16_V && V->type != GGML_TYPE_F16) {
if (V && need_f16_V && V->type != GGML_TYPE_F16) {
GGML_ASSERT(ggml_is_contiguously_allocated(V));
V_f16.alloc(ggml_nelements(V));
to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
Expand Down
Loading