ggml-org
diff --git a/‎Makefile
Lines changed: 15 additions & 2 deletions b/‎Makefile
Lines changed: 15 additions & 2 deletions
diff --git a/‎common/common.cpp
Lines changed: 10 additions & 138 deletions b/‎common/common.cpp
Lines changed: 10 additions & 138 deletions
diff --git a/‎common/common.h
Lines changed: 6 additions & 4 deletions b/‎common/common.h
Lines changed: 6 additions & 4 deletions
@@ -177,6 +177,10 @@ ifdef LLAMA_DISABLE_LOGS
 	MK_CPPFLAGS += -DLOG_DISABLE_LOGS
 endif # LLAMA_DISABLE_LOGS
 
+ifdef LLAMA_DISABLE_SEQREP_SAMPLER
+	MK_CPPFLAGS += -DLLAMA_NO_SEQREP_SAMPLER
+endif # LLAMA_DISABLE_SEQREP_SAMPLER
+
 # warnings
 MK_CFLAGS    += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith \
 				-Wmissing-prototypes -Werror=implicit-int -Wno-unused-function
@@ -476,7 +480,13 @@ OBJS += ggml-alloc.o
 llama.o: llama.cpp ggml.h ggml-alloc.h ggml-cuda.h ggml-metal.h llama.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
-common.o: common/common.cpp common/common.h build-info.h common/log.h
+COMMON_DEPS = common/common.cpp common/common.h build-info.h common/log.h
+COMMON_OBJS = common.o
+ifndef LLAMA_DISABLE_SEQREP_SAMPLER # common.o only needs the sampler header; the .cpp is compiled by the seqrep-sampler.o rule
+COMMON_DEPS += common/seqrep-sampler.h
+COMMON_OBJS += seqrep-sampler.o
+endif # LLAMA_DISABLE_SEQREP_SAMPLER
+common.o: $(COMMON_DEPS)
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
 console.o: common/console.cpp common/console.h
@@ -485,6 +495,9 @@ console.o: common/console.cpp common/console.h
 grammar-parser.o: common/grammar-parser.cpp common/grammar-parser.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@
 
+seqrep-sampler.o: common/seqrep-sampler.cpp common/seqrep-sampler.h # own object so it is only linked when listed in COMMON_OBJS
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 libllama.so: llama.o ggml.o $(OBJS)
 	$(CXX) $(CXXFLAGS) -shared -fPIC -o $@ $^ $(LDFLAGS)
 
@@ -495,7 +508,7 @@ clean:
 # Examples
 #
 
-main: examples/main/main.cpp                                  build-info.h ggml.o llama.o common.o console.o grammar-parser.o $(OBJS)
+main: examples/main/main.cpp                                  build-info.h ggml.o llama.o $(COMMON_OBJS) console.o grammar-parser.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $(filter-out %.h,$^) -o $@ $(LDFLAGS)
 	@echo
 	@echo '====  Run ./main -h for help.  ===='
 
@@ -2,6 +2,10 @@
 #include "build-info.h"
 #include "llama.h"
 
+#ifndef LLAMA_NO_SEQREP_SAMPLER
+#include "seqrep-sampler.h"
+#endif
+
 #include <algorithm>
 #include <cassert>
 #include <cmath>
@@ -102,144 +106,6 @@ void process_escapes(std::string& input) {
     input.resize(output_idx);
 }
 
-void seqrep_sampler_params_init(llama_sampler_seqrep_params * params) { // Reset *params: zero the whole struct, then set the three non-zero defaults.
-    assert(params != NULL); // a null pointer is a caller bug
-    memset(params, 0, sizeof(llama_sampler_seqrep_params)); // every field not assigned below starts as all-zero bytes
-    params->last_n = 256;
-    params->mid_word_scale = 0.1f;
-    params->tolerance_half_step_cost = 1.0f;
-}
-
-void seqrep_sampler_params_dump(const llama_sampler_seqrep_params * params) { // Write the fields listed below as one "seqrep(...)" line through LOG_TEE.
-    assert(params != NULL); // a null pointer is a caller bug
-    LOG_TEE("seqrep(last_n = %d, min_length = %zd, start_offset = %zd, presence_penalty = %.4f, length_penalty = %.4f, tolerance = %.4f, mid_word_scale = %.4f, tolerance_match_credit = %.4f, tolerance_half_step_cost = %.4f, flags = %d)\n",
-        params->last_n, params->min_length, params->start_offset, params->presence_penalty, // NOTE(review): %zd expects a signed size type - confirm the types of min_length/start_offset
-        params->length_penalty, params->tolerance, params->mid_word_scale, params->tolerance_match_credit,
-        params->tolerance_half_step_cost, params->flags);
-}
-
-void seqrep_sampler_help() { // Print the help text for the seqrep sampler configuration string to stdout.
-    llama_sampler_seqrep_params p;
-    seqrep_sampler_params_init(&p); // the "default:" values printed below are read from a freshly initialised struct
-    fprintf(stdout, "==== Sequence Repetition Sampler Help ====\n\n");
-    fprintf(stdout, "  The sequence repetition sampler takes a configuration string in the format:\n");
-    fprintf(stdout, "  arg1:arg2:argN\n");
-    fprintf(stdout, "  A colon separated argument can be a key value pair like xyz=1 or flag like xyz\n");
-    fprintf(stdout, "\n- Available key/value arguments\n");
-    fprintf(stdout, "  * repetition_mode=REPEAT_PENALTY\n    emulates the repetition penalty sampler. warning: 1.0 disables penalties since this preset enables flag_divide_by_penalty. using 0.0 is probably not what you want\n");
-    fprintf(stdout, "  * presence_mode=PRESENCE_PENALTY\n    emulates the presence penalty sampler\n");
-    fprintf(stdout, "  * frequency_mode=FREQUENCY_PENALTY\n    Emulates the repetition penalty sampler\n"); // NOTE(review): likely copy/paste slip - the example below maps frequency_mode to --frequency-penalty, so this should read "frequency penalty sampler"
-    fprintf(stdout, "  * last_n\n    last n tokens to consider for sequence penalizing (default: %d, 0 = disabled, -1 = ctx_size)\n", p.last_n);
-    fprintf(stdout, "  * min_length\n    minimum matching sequence length (default: %zd, < 2 = disabled)\n", p.min_length); // NOTE(review): %zd expects a signed size type - confirm the type of min_length
-    fprintf(stdout, "  * presence_penalty\n    presence penalty for tokens that can continue a sequence (default: %f, 0.0 = disabled)\n", p.presence_penalty);
-    fprintf(stdout, "  * length_penalty\n    penalty for tokens that can continue a sequence, multiplied by length (default: %f, 0.0 = disabled)\n", p.length_penalty);
-    fprintf(stdout, "  * tolerance\n    tolerance for fuzzy matching sequences (default: %f, 0 = disabled)\n", p.tolerance);
-    fprintf(stdout, "  * mid_word_scale\n    scale penalty when for mid-word tokens. 1.0 would mean apply the full penalty (default: %f, 1.0 = disabled)\n", p.mid_word_scale); // NOTE(review): "when for" reads like a typo in the help text
-    fprintf(stdout, "  * tolerance_match_credit\n    credit tolerance on matched tokens (default: %f, 0.0 = disabled)\n", p.tolerance_match_credit);
-    fprintf(stdout, "  * tolerance_half_step_cost\n    advanced option to adjust tolerance cost for failed matches within a half step of a match (default: %f, 1.0 = normal)\n", p.tolerance_half_step_cost);
-    fprintf(stdout, "\n- Available flags arguments (currently all default to disabled)\n");
-    fprintf(stdout, "  * flag_immediate_wildcard\n    when tolerance is consumed, by default it doesn't count as a match until a real match is found\n");
-    fprintf(stdout, "  * flag_tolerance_no_consecutive\n    do not allow using tolerance consecutively\n");
-    fprintf(stdout, "  * flag_tolerance_no_first\n    do not allow using tolerance before the first match\n");
-    fprintf(stdout, "  * flag_tolerance_cap_initial\n    only meaningful with match credit, prevents match credit adjusting tolerance higher than the initial value\n");
-    fprintf(stdout, "  * flag_penalize_length_max_seen\n    when applying length_penalty, use the maximum seen sequence length rather than the total length of seen sequences\n");
-    fprintf(stdout, "  * flag_divide_by_penalty\n    divide the logit when applying a penalty rather than subtracting it. warning: when this flag is enabled, 1.0 disables penalties not 0.0. 0.0 is probably not what you want\n");
-    fprintf(stdout, "\n- Examples:\n");
-    fprintf(stdout, "  * repetition_mode=1.2:last_n=32\n    same as --repeat-last-n 32 --repeat-penalty 1.2\n");
-    fprintf(stdout, "  * presence_mode=.2:last_n=32\n    same as --repeat-last-n 32 --presence-penalty .2\n");
-    fprintf(stdout, "  * frequency_mode=.2:last_n=32\n    same as --repeat-last-n 32 --frequency-penalty .2\n");
-    fprintf(stdout, "  * min_length=3:tolerance=1:length_penalty=.2:last_n=-1\n    match repeated sequences of at least 3 tokens within the entire context and apply a penalty of 0.2*total_length to the token that would continue the sequence. allow one non-matching token in matched sequences.\n");
-}
-
-bool seqrep_sampler_params_parse(char * s, llama_sampler_seqrep_params * params) { // Apply the colon-separated "key=value" / "flag" items in s to *params; returns false at the first unknown key, true otherwise.
-    assert(params != NULL);
-    assert(s != NULL);
-    size_t offset = 0; // start of the item currently being parsed
-    std::string sparams = s;
-    size_t slen = sparams.size();
-
-    while (offset < slen) {
-        // printf("SR OFFS: %lu\n", offset);
-        size_t argsep = sparams.find_first_of(':', offset); // next item separator, or npos on the last item
-        std::string argchunk;
-        if (argsep == std::string::npos) {
-            argchunk = sparams.substr(offset);
-        } else if (argsep > offset) {
-            argchunk = sparams.substr(offset, argsep - offset);
-        } // argsep == offset (e.g. "a::b"): argchunk stays empty and is skipped below
-        std::string argval;
-        size_t valsep = argchunk.find_first_of('=');
-        if (valsep != std::string::npos && valsep < argchunk.size()) { // split "key=value" at the first '='
-            argval = argchunk.substr(valsep + 1);
-            argchunk.resize(valsep); // argchunk now holds only the key
-        }
-        // printf("SR: k[%s] = v[%s]\n", argchunk.c_str(), argval.c_str());
-        if (argchunk.empty() && argval.empty()) {
-            // pass
-        } else if (argchunk == "repetition_mode") { // preset: overwrites last_n, min_length, mid_word_scale, flags and both penalties
-            params->last_n = 64;
-            params->min_length = 1;
-            params->mid_word_scale = 1.0f;
-            params->flags = LLAMA_SEQREP_DIVIDE_BY_PENALTY; // assignment, not |=: flags set by earlier items are discarded
-            params->length_penalty = 1.0f;
-            params->presence_penalty = argval.empty() ? 1.1f : std::atof(argval.c_str()); // value defaults to 1.1 when omitted
-        } else if (argchunk == "presence_mode") { // preset: same resets as above, with flags cleared
-            params->last_n = 64;
-            params->min_length = 1;
-            params->mid_word_scale = 1.0f;
-            params->flags = 0;
-            params->length_penalty = 0.0f;
-            params->presence_penalty = std::atof(argval.c_str()); // no default here: std::atof yields 0.0 when the value is missing or not a number
-        } else if (argchunk == "frequency_mode") { // preset: the value goes to length_penalty instead of presence_penalty
-            params->last_n = 64;
-            params->min_length = 1;
-            params->mid_word_scale = 1.0f;
-            params->flags = 0;
-            params->length_penalty = std::atof(argval.c_str());
-            params->presence_penalty = 0.0f;
-        } else if (argchunk == "flag_immediate_wildcard") { // each flag_* item ORs one bit into params->flags; any "=value" part is ignored
-            params->flags |= LLAMA_SEQREP_IMMEDIATE_WILDCARD;
-        } else if (argchunk == "flag_tolerance_no_consecutive") {
-            params->flags |= LLAMA_SEQREP_TOLERANCE_NO_CONSECUTIVE;
-        } else if (argchunk == "flag_tolerance_no_first") {
-            params->flags |= LLAMA_SEQREP_TOLERANCE_NO_FIRST;
-        } else if (argchunk == "flag_tolerance_cap_initial") {
-            params->flags |= LLAMA_SEQREP_TOLERANCE_CAP_INITIAL;
-        } else if (argchunk == "flag_penalize_length_max_seen") {
-            params->flags |= LLAMA_SEQREP_PENALIZE_LENGTH_MAX_SEEN;
-        } else if (argchunk == "flag_divide_by_penalty") {
-            params->flags |= LLAMA_SEQREP_DIVIDE_BY_PENALTY;
-        } else if (argchunk == "min_length") { // numeric items use atoi/atof, so text that does not start with a number silently becomes 0
-            params->min_length = std::atoi(argval.c_str());
-        } else if (argchunk == "start_offset") {
-            params->start_offset = std::atoi(argval.c_str());
-        } else if (argchunk == "last_n") {
-            params->last_n = std::atoi(argval.c_str());
-        } else if (argchunk == "tolerance") {
-            params->tolerance = std::atof(argval.c_str());
-        } else if (argchunk == "presence_penalty") {
-            params->presence_penalty = std::atof(argval.c_str());
-        } else if (argchunk == "length_penalty") {
-            params->length_penalty = std::atof(argval.c_str());
-        } else if (argchunk == "mid_word_scale") {
-            params->mid_word_scale = std::atof(argval.c_str());
-        } else if (argchunk == "tolerance_match_credit") {
-            params->tolerance_match_credit = std::atof(argval.c_str());
-        } else if (argchunk == "tolerance_half_step_cost") {
-            params->tolerance_half_step_cost = std::atof(argval.c_str());
-        } else { // unknown key: report it and stop; items applied before it stay in *params
-            fprintf(stderr, "seqrep: Bad argument [%s]=[%s]!\n", argchunk.c_str(), argval.c_str());
-            return false;
-        }
-        if (argsep != std::string::npos) {
-            offset = argsep + 1; // continue right after the ':'
-        } else {
-            break; // the last item has been consumed
-        }
-    }
-    return true;
-}
-
 bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     bool invalid_param = false;
     std::string arg;
@@ -386,6 +252,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                 break;
             }
             params.presence_penalty = std::stof(argv[i]);
+#ifndef LLAMA_NO_SEQREP_SAMPLER
         } else if (arg == "-seqrep" || arg == "--seqrep-penalty") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -405,6 +272,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
                     && (sr_params.presence_penalty != 0.0f || sr_params.length_penalty != 0.0f)) {
                 params.seqrep_params.push_back(sr_params);
             }
+#endif
         } else if (arg == "--mirostat") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -779,8 +647,10 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  --repeat-penalty N    penalize repeat sequence of tokens (default: %.1f, 1.0 = disabled)\n", (double)params.repeat_penalty);
     printf("  --presence-penalty N  repeat alpha presence penalty (default: %.1f, 0.0 = disabled)\n", (double)params.presence_penalty);
     printf("  --frequency-penalty N repeat alpha frequency penalty (default: %.1f, 0.0 = disabled)\n", (double)params.frequency_penalty);
+#ifndef LLAMA_NO_SEQREP_SAMPLER
     printf("  -seqrep CFG, --seqrep-penalty CFG\n");
     printf("                        add a copy of the sequence repetition penalty sampler. may be specified multiple times. for help: -seqrep help\n");
+#endif
     printf("  --mirostat N          use Mirostat sampling.\n");
     printf("                        Top K, Nucleus, Tail Free and Locally Typical samplers are ignored if used.\n");
     printf("                        (default: %d, 0 = disabled, 1 = Mirostat, 2 = Mirostat 2.0)\n", params.mirostat);
@@ -1069,9 +939,11 @@ llama_token llama_sample_token(
                 last_tokens.data() + last_tokens.size() - last_n_repeat,
                 last_n_repeat, alpha_frequency, alpha_presence);
 
+#ifndef LLAMA_NO_SEQREP_SAMPLER
         for (auto & sr_params : params.seqrep_params) {
             llama_sample_seqrep_penalty(ctx, &cur_p, last_tokens.data(), last_tokens.size(), &sr_params);
         }
+#endif
 
         if (!penalize_nl) {
             for (size_t idx = 0; idx < cur_p.size; idx++) {
 
@@ -4,6 +4,10 @@
 
 #include "llama.h"
 
+#ifndef LLAMA_NO_SEQREP_SAMPLER
+#include "seqrep-sampler.h"
+#endif
+
 #define LOG_NO_FILE_LINE_FUNCTION
 #include "log.h"
 
@@ -55,7 +59,9 @@ struct gpt_params {
     int32_t repeat_last_n     = 64;    // last n tokens to penalize (0 = disable penalty, -1 = context size)
     float   frequency_penalty = 0.00f; // 0.0 = disabled
     float   presence_penalty  = 0.00f; // 0.0 = disabled
+#ifndef LLAMA_NO_SEQREP_SAMPLER
     std::vector<llama_sampler_seqrep_params> seqrep_params;
+#endif
     int32_t mirostat          = 0;     // 0 = disabled, 1 = mirostat, 2 = mirostat 2.0
     float   mirostat_tau      = 5.00f; // target entropy
     float   mirostat_eta      = 0.10f; // learning rate
@@ -205,7 +211,3 @@ std::string get_sortable_timestamp();
 void dump_non_result_info_yaml(
     FILE * stream, const gpt_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
-
-void seqrep_sampler_params_init(llama_sampler_seqrep_params * params);
-void seqrep_sampler_params_dump(const llama_sampler_seqrep_params * params);
-bool seqrep_sampler_params_parse(char * s, llama_sampler_seqrep_params * params);