
Commit cb02274

Allow scaling seqrep penalty for mid-word tokens
1 parent bd727dd commit cb02274

5 files changed: +60 −6 lines

examples/common.cpp

Lines changed: 7 additions & 0 deletions
@@ -280,6 +280,12 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.seqrep_lpenalty = std::stof(argv[i]);
+        } else if (arg == "--seqrep-mw-scale") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            params.seqrep_mw_scale = std::stof(argv[i]);
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -591,6 +597,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stdout, "  --seqrep-tolerance N  tolerance for fuzzy matching sequences (default: %d, 0 = disabled)\n", params.seqrep_tolerance);
     fprintf(stdout, "  --seqrep-ppenalty N   presence penalty for tokens that can continue a sequence (default: %f, 0.0 = disabled)\n", params.seqrep_ppenalty);
     fprintf(stdout, "  --seqrep-lpenalty N   penalty for tokens that can continue a sequence, multiplied by length (default: %f, 0.0 = disabled)\n", params.seqrep_lpenalty);
+    fprintf(stdout, "  --seqrep-mw-scale N   scale the penalty for mid-word tokens; 1.0 applies the full penalty (default: %f, 1.0 = disabled)\n", params.seqrep_mw_scale);
     fprintf(stdout, "  --mirostat N          use Mirostat sampling.\n");
     fprintf(stdout, "                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
     fprintf(stdout, "                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);

examples/common.h

Lines changed: 1 addition & 0 deletions
@@ -49,6 +49,7 @@ struct gpt_params {
     int32_t seqrep_tolerance = 0;    // tolerance for fuzzy sequence matching (0 = disabled)
     float   seqrep_ppenalty  = 0.0f; // flat penalty (0.0 = disabled)
     float   seqrep_lpenalty  = 0.0f; // stacking penalty based on length (0.0 = disabled)
+    float   seqrep_mw_scale  = 0.1f; // scale penalty when applied to mid-word tokens (1.0 = apply full penalty)
     int32_t mirostat         = 0;    // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau     = 5.00f; // target entropy
     float   mirostat_eta     = 0.10f; // learning rate

examples/main/main.cpp

Lines changed: 3 additions & 3 deletions
@@ -334,9 +334,9 @@ int main(int argc, char ** argv) {
             fprintf(stderr, "Input suffix: '%s'\n", params.input_suffix.c_str());
         }
     }
-    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, seqrep(last_n = %d, min_len = %d, tolerance = %d, ppenalty = %f, lpenalty = %f), top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
+    fprintf(stderr, "sampling: repeat_last_n = %d, repeat_penalty = %f, presence_penalty = %f, frequency_penalty = %f, seqrep(last_n = %d, min_len = %d, tolerance = %d, ppenalty = %f, lpenalty = %f, mw_scale = %f), top_k = %d, tfs_z = %f, top_p = %f, typical_p = %f, temp = %f, mirostat = %d, mirostat_lr = %f, mirostat_ent = %f\n",
             params.repeat_last_n, params.repeat_penalty, params.presence_penalty, params.frequency_penalty,
-            params.seqrep_last_n, params.seqrep_min_len, params.seqrep_tolerance, params.seqrep_ppenalty, params.seqrep_lpenalty,
+            params.seqrep_last_n, params.seqrep_min_len, params.seqrep_tolerance, params.seqrep_ppenalty, params.seqrep_lpenalty, params.seqrep_mw_scale,
             params.top_k, params.tfs_z, params.top_p, params.typical_p, params.temp, params.mirostat, params.mirostat_eta, params.mirostat_tau);
     fprintf(stderr, "generate: n_ctx = %d, n_batch = %d, n_predict = %d, n_keep = %d\n", n_ctx, params.n_batch, params.n_predict, params.n_keep);
     fprintf(stderr, "\n\n");
@@ -604,7 +604,7 @@ int main(int argc, char ** argv) {
                 llama_sample_seqrep_penalty(ctx, &candidates_p,
                     last_n_tokens.data() + last_n_tokens.size() - seqrep_last_n_repeat,
                     seqrep_last_n_repeat, params.seqrep_min_len, params.seqrep_tolerance,
-                    params.seqrep_ppenalty, params.seqrep_lpenalty);
+                    params.seqrep_ppenalty, params.seqrep_lpenalty, params.seqrep_mw_scale);
                 if (!penalize_nl) {
                     logits[llama_token_nl()] = nl_logit;
                 }

llama.cpp

Lines changed: 41 additions & 2 deletions
@@ -42,6 +42,7 @@
 #include <queue>
 #include <cassert>
 #include <cstring>
+#include <cctype>
 #include <climits>
 #include <memory>
 #include <algorithm>
@@ -2690,11 +2691,44 @@ static size_t llama_seqrep_find_match(const llama_token * last_tokens_p, const s
     return matches;
 }
 
-void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, size_t min_length, size_t tolerance, float flat_penalty, float length_penalty) {
+// Internal helper function for sequence matching.
+// Bit 1 set indicates the token starts on a word boundary. NL, " blah", "," - word boundary. "blah", "blah:" - not a word boundary.
+// Bit 2 set indicates the token ends on a word boundary. NL, "blah:", "blah " - ends on word boundary. " blah", "blah" - doesn't end on word boundary.
+// Errata: UTF-8 safe but only considers ASCII characters. The ASCII single quote is treated as a non-boundary, which isn't always correct.
+static uint8_t llama_seqrep_check_word(struct llama_context * ctx, const llama_token token) {
+    if (token == llama_token_bos() || token == llama_token_eos() || token == llama_token_nl()) {
+        // BOS, EOS, NL are always a boundary.
+        return 3;
+    }
+    const char * token_str = llama_token_to_str(ctx, token);
+    assert(token_str != NULL);
+    if (token_str[0] == '\0') {
+        // 0-length token string, can't be a boundary.
+        return 0;
+    }
+
+    const char start_char = token_str[0];
+    char end_char;
+    for (const char * curr_char = token_str; ; curr_char++) {
+        // Guaranteed to iterate at least once since we already checked if the string was 0-length.
+        if (*(curr_char + 1) == '\0') {
+            end_char = *curr_char;
+            break;
+        }
+    }
+    return uint8_t(
+        (start_char != '\'' && !isalnum((int)start_char) ? 1 : 0) +
+        (end_char   != '\'' && !isalnum((int)end_char)   ? 2 : 0)
+    );
+}
+
+void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, size_t min_length, size_t tolerance, float flat_penalty, float length_penalty, float mid_word_scale) {
     if (min_length < 2 || last_tokens_size <= min_length ||
             (flat_penalty == 0.0f && length_penalty == 0.0f)) {
         return;
     }
+    assert(ctx);
 
     const int64_t t_start_sample_us = ggml_time_us();
 
@@ -2719,9 +2753,14 @@ void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_ar
                 penalize_tokens[penalize_token] = pt_iter->second + matched_length;
             }
         }
+
+        const bool ends_on_word = (llama_seqrep_check_word(ctx, last_tokens_p[last_tokens_size - 1]) & 2) != 0;
+
         for (const auto it : penalize_tokens) {
+            const bool pt_starts_word = (llama_seqrep_check_word(ctx, it.first) & 1) != 0;
             candidates->data[it.first].logit -=
-                float(it.second) * length_penalty + float(it.second > 0) * flat_penalty;
+                (float(it.second) * length_penalty + float(it.second > 0) * flat_penalty)
+                * (ends_on_word || pt_starts_word ? 1.0f : mid_word_scale);
         }
 
         if (ctx) {
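To make the boundary flags and the scaling arithmetic concrete, here is a small self-contained sketch (not part of the commit): it classifies plain C strings directly, whereas the real helper reads token strings via llama_token_to_str, and the penalty numbers are made up.

// Standalone sketch of llama_seqrep_check_word and the mid-word scaling above.
#include <cctype>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Same bit-flag classification as llama_seqrep_check_word, on a plain string.
static uint8_t check_word(const char * s) {
    const size_t len = strlen(s);
    if (len == 0) return 0;
    const char start_char = s[0], end_char = s[len - 1];
    return uint8_t(
        (start_char != '\'' && !isalnum((unsigned char)start_char) ? 1 : 0) +
        (end_char   != '\'' && !isalnum((unsigned char)end_char)   ? 2 : 0));
}

int main() {
    printf("%u\n", check_word(" blah")); // 1: starts on a boundary (leading space)
    printf("%u\n", check_word("blah:")); // 2: ends on a boundary (':')
    printf("%u\n", check_word("blah"));  // 0: mid-word at both ends

    // With matched_length = 4, length_penalty = 0.2, flat_penalty = 0.5 and
    // mid_word_scale = 0.1, a mid-word continuation loses
    // (4 * 0.2 + 0.5) * 0.1 = 0.13 logits instead of the full 1.3.
    const float length_penalty = 0.2f, flat_penalty = 0.5f, mid_word_scale = 0.1f;
    const int   matched_length = 4;
    const bool  ends_on_word = false, pt_starts_word = false;
    const float delta = (float(matched_length) * length_penalty + flat_penalty)
                      * (ends_on_word || pt_starts_word ? 1.0f : mid_word_scale);
    printf("penalty = %f\n", delta); // 0.130000
    return 0;
}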

llama.h

Lines changed: 8 additions & 1 deletion
@@ -412,7 +412,14 @@ extern "C" {
     /// @params tolerance Tolerance for non-matching tokens in a sequence.
     /// @params flat_penalty Flat penalty applied to the token that can continue a repeated sequence.
     /// @params stacking_penalty Scaling penalty applied to the token that can continue a repeated sequence. The penalty is multiplied by the total length of sequences that are continued by this token.
-    LLAMA_API void llama_sample_seqrep_penalty(struct llama_context * ctx, llama_token_data_array * candidates, const llama_token * last_tokens_p, size_t last_tokens_size, size_t min_length, size_t tolerance, float flat_penalty, float length_penalty);
+    /// @params mid_word_scale Scale applied to the penalty for mid-word tokens, i.e. when the penalized token doesn't start on a word boundary and the last seen token doesn't end on one.
+    LLAMA_API void llama_sample_seqrep_penalty(
+            struct llama_context * ctx,
+            llama_token_data_array * candidates,
+            const llama_token * last_tokens_p, size_t last_tokens_size,
+            size_t min_length, size_t tolerance,
+            float flat_penalty, float length_penalty,
+            float mid_word_scale);
 
     /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806
     /// @param candidates A vector of `llama_token_data` containing the candidate tokens, the logits must be directly extracted from the original generation context without being sorted.
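A minimal call-site sketch against the new signature (a hypothetical helper, not from this commit; it assumes a loaded llama_context and a populated candidate array as prepared in examples/main/main.cpp, and the numeric values are placeholders):

#include <vector>
#include "llama.h"

// Applies the seqrep penalty with mid-word scaling to prepared candidates.
static void apply_seqrep(struct llama_context * ctx,
                         llama_token_data_array * candidates,
                         const std::vector<llama_token> & last_tokens) {
    llama_sample_seqrep_penalty(ctx, candidates,
            last_tokens.data(), last_tokens.size(),
            /* min_length     */ 3,     // only penalize sequences of 3+ tokens
            /* tolerance      */ 0,     // exact sequence matching
            /* flat_penalty   */ 0.0f,  // disabled here
            /* length_penalty */ 0.2f,  // per matched token of sequence length
            /* mid_word_scale */ 0.1f); // 1.0f would disable mid-word scaling
}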
