
Commit ac52243

Neha Abbas authored and committed
most recent merge
2 parents 2c57726 + 4ad0986, commit ac52243

File tree: 99 files changed, +3977 -357 lines

README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -17,6 +17,7 @@ LLM inference in C/C++
 
 ## Hot topics
 
+- Support for the `gpt-oss` model with native MXFP4 format has been added | [PR](https://github.com/ggml-org/llama.cpp/pull/15091) | [Collaboration with NVIDIA](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss) | [Comment](https://github.com/ggml-org/llama.cpp/discussions/15095)
 - Hot PRs: [All](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+) | [Open](https://github.com/ggml-org/llama.cpp/pulls?q=is%3Apr+label%3Ahot+is%3Aopen)
 - Multimodal support arrived in `llama-server`: [#12898](https://github.com/ggml-org/llama.cpp/pull/12898) | [documentation](./docs/multimodal.md)
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
```

common/arg.cpp

Lines changed: 28 additions & 11 deletions
```diff
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
```
```diff
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
             throw std::invalid_argument("unknown buffer type");
         }
-        // FIXME: this leaks memory
-        params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+        // keep strings alive and avoid leaking memory by storing them in a static vector
+        static std::list<std::string> buft_overrides;
+        buft_overrides.push_back(tensor_name);
+        params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
     }
 }
 ));
 add_opt(common_arg(
-    {"--cpu-moe"},
-    "use CPU for Mixture of Experts (MoE) weights",
+    {"--cpu-moe", "-cmoe"},
+    "keep all Mixture of Experts (MoE) weights in the CPU",
     [](common_params & params) {
-        params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-        params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
     }
 ).set_env("LLAMA_ARG_CPU_MOE"));
+add_opt(common_arg(
+    {"--n-cpu-moe", "-ncmoe"}, "N",
+    "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+    [](common_params & params, int value) {
+        if (value < 0) {
+            throw std::invalid_argument("invalid value");
+        }
+        for (int i = 0; i < value; ++i) {
+            // keep strings alive and avoid leaking memory by storing them in a static vector
+            static std::list<std::string> buft_overrides;
+            buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+            params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+        }
+    }
+).set_env("LLAMA_ARG_N_CPU_MOE"));
 add_opt(common_arg(
     {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
     "number of layers to store in VRAM",
```
```diff
@@ -2649,10 +2665,10 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
 add_opt(common_arg(
     {"--output-format"}, "{gguf,dat}",
-    string_format("output format for imatrix file (default: %s)", params.imat_dat ? "dat" : "gguf"),
+    string_format("output format for imatrix file (default: %s)", params.imat_dat > 0 ? "dat" : "gguf"),
     [](common_params & params, const std::string & value) {
-        /**/ if (value == "gguf") { params.imat_dat = false; }
-        else if (value == "dat") { params.imat_dat = true; }
+        /**/ if (value == "gguf") { params.imat_dat = -1; }
+        else if (value == "dat") { params.imat_dat = 1; }
         else { throw std::invalid_argument("invalid output format"); }
     }
 ).set_examples({LLAMA_EXAMPLE_IMATRIX}));
```
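The switch from `bool` to a signed value here pairs with the `common.h` change further down (`int8_t imat_dat`, commented "gguf <= 0 < dat"): 0 now means "not specified", so downstream code can tell an explicit `--output-format gguf` apart from the user saying nothing. A hedged sketch of consuming such a tri-state; the suffix-based fallback for the unset case is an illustrative assumption, not taken from this commit:

```cpp
#include <cstdint>
#include <cstdio>
#include <string>

enum class imat_fmt { gguf, dat };

// imat_dat: -1 = user chose gguf, +1 = user chose dat, 0 = unset.
static imat_fmt resolve_imat_format(int8_t imat_dat, const std::string & out_file) {
    if (imat_dat > 0) return imat_fmt::dat;
    if (imat_dat < 0) return imat_fmt::gguf;
    // unset: fall back to the output file suffix (assumed behavior, for illustration)
    const std::string ext = ".dat";
    const bool looks_dat = out_file.size() >= ext.size() &&
        out_file.compare(out_file.size() - ext.size(), ext.size(), ext) == 0;
    return looks_dat ? imat_fmt::dat : imat_fmt::gguf;
}

int main() {
    // an explicit choice wins; unset defers to the (assumed) suffix heuristic
    std::printf("%d %d\n",
        (int) resolve_imat_format(-1, "imatrix.dat"),  // 0 -> gguf
        (int) resolve_imat_format( 0, "imatrix.dat")); // 1 -> dat
}
```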
```diff
@@ -2931,11 +2947,12 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
     "controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
     "- none: leaves thoughts unparsed in `message.content`\n"
     "- deepseek: puts thoughts in `message.reasoning_content` (except in streaming mode, which behaves as `none`)\n"
-    "(default: deepseek)",
+    "(default: auto)",
     [](common_params & params, const std::string & value) {
         /**/ if (value == "deepseek") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; }
         else if (value == "deepseek-legacy") { params.reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY; }
         else if (value == "none") { params.reasoning_format = COMMON_REASONING_FORMAT_NONE; }
+        else if (value == "auto") { params.reasoning_format = COMMON_REASONING_FORMAT_AUTO; }
         else { throw std::invalid_argument("invalid value"); }
     }
 ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MAIN}).set_env("LLAMA_ARG_THINK"));
```
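Looking back at the `--cpu-moe` / `--n-cpu-moe` hunk in this file: the in-code comment says "static vector", but the container is deliberately a `std::list`. Unlike `std::vector`, a list never relocates its nodes on `push_back`, so the `const char *` stored in `tensor_buft_overrides` via `c_str()` stays valid, and making the list `static` stretches that lifetime to process exit. A minimal standalone sketch of the same pattern (the `intern_pattern` helper and `override_entry` struct are illustrative, not from the patch):

```cpp
#include <list>
#include <string>
#include <vector>

struct override_entry {
    const char * pattern; // borrowed pointer; must outlive this entry
};

// Copy a pattern into storage that lives until process exit and hand back a
// stable C string. std::list matters here: push_back never moves existing
// nodes, so pointers obtained earlier remain valid.
static const char * intern_pattern(std::string s) {
    static std::list<std::string> storage;
    storage.push_back(std::move(s));
    return storage.back().c_str();
}

int main() {
    std::vector<override_entry> overrides;
    for (int i = 0; i < 3; ++i) {
        overrides.push_back({intern_pattern("blk\\." + std::to_string(i) + "\\.ffn_(up|down|gate)_exps")});
    }
}
```

In use, the new flags compose with the usual offload options: keeping the expert weights of the first 10 layers on the CPU is `--n-cpu-moe 10`, or `LLAMA_ARG_N_CPU_MOE=10` via the environment, per the `set_env` calls above.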

common/chat.cpp

Lines changed: 48 additions & 2 deletions
```diff
@@ -126,6 +126,8 @@ std::vector<common_chat_msg_diff> common_chat_msg_diff::compute_diffs(const comm
 typedef minja::chat_template common_chat_template;
 
 struct common_chat_templates {
+    bool add_bos;
+    bool add_eos;
     bool has_explicit_template; // Model had builtin template or template overridde was specified.
     std::unique_ptr<common_chat_template> template_default; // always set (defaults to chatml)
     std::unique_ptr<common_chat_template> template_tool_use;
@@ -143,6 +145,8 @@ struct templates_params {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     json extra_context;
+    bool add_bos;
+    bool add_eos;
 };
 
 common_chat_tool_choice common_chat_tool_choice_parse_oaicompat(const std::string & tool_choice) {
@@ -445,6 +449,8 @@ std::string common_chat_format_single(
 
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
 
     std::string fmt_past_msg;
     if (!past_msg.empty()) {
@@ -469,6 +475,8 @@
 std::string common_chat_format_example(const struct common_chat_templates * tmpls, bool use_jinja) {
     common_chat_templates_inputs inputs;
     inputs.use_jinja = use_jinja;
+    inputs.add_bos = tmpls->add_bos;
+    inputs.add_eos = tmpls->add_eos;
     auto add_simple_msg = [&](auto role, auto content) {
         common_chat_msg msg;
         msg.role = role;
@@ -546,6 +554,8 @@ common_chat_templates_ptr common_chat_templates_init(
     }
     std::string token_bos = bos_token_override;
     std::string token_eos = eos_token_override;
+    bool add_bos = false;
+    bool add_eos = false;
     if (model) {
         const auto * vocab = llama_model_get_vocab(model);
         const auto get_token = [&](llama_token token, const char * name, const char * jinja_variable_name) {
@@ -560,9 +570,13 @@
         };
         token_bos = get_token(llama_vocab_bos(vocab), "BOS", "bos_token");
         token_eos = get_token(llama_vocab_eos(vocab), "EOS", "eos_token");
+        add_bos = llama_vocab_get_add_bos(vocab);
+        add_eos = llama_vocab_get_add_eos(vocab);
     }
     common_chat_templates_ptr tmpls(new common_chat_templates());
     tmpls->has_explicit_template = has_explicit_template;
+    tmpls->add_bos = add_bos;
+    tmpls->add_eos = add_eos;
     try {
         tmpls->template_default = std::make_unique<minja::chat_template>(default_template_src, token_bos, token_eos);
     } catch (const std::exception & e) {
@@ -592,6 +606,7 @@ const char * common_chat_format_name(common_chat_format format) {
        case COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1: return "Functionary v3.1 Llama 3.1";
        case COMMON_CHAT_FORMAT_HERMES_2_PRO: return "Hermes 2 Pro";
        case COMMON_CHAT_FORMAT_COMMAND_R7B: return "Command R7B";
+       case COMMON_CHAT_FORMAT_GPT_OSS: return "GPT-OSS";
        default:
            throw std::runtime_error("Unknown chat format");
    }
@@ -600,6 +615,7 @@ const char * common_chat_format_name(common_chat_format format) {
 const char * common_reasoning_format_name(common_reasoning_format format) {
    switch (format) {
        case COMMON_REASONING_FORMAT_NONE: return "none";
+       case COMMON_REASONING_FORMAT_AUTO: return "auto";
        case COMMON_REASONING_FORMAT_DEEPSEEK: return "deepseek";
        case COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY: return "deepseek-legacy";
        default:
```
```diff
@@ -748,10 +764,10 @@ static std::string apply(
     // instead of using `chat_template_options.use_bos_token = false`, since these tokens
     // may be needed inside the template / between messages too.
     auto result = tmpl.apply(tmpl_inputs, tmpl_opts);
-    if (string_starts_with(result, tmpl.bos_token())) {
+    if (inputs.add_bos && string_starts_with(result, tmpl.bos_token())) {
         result = result.substr(tmpl.bos_token().size());
     }
-    if (string_ends_with(result, tmpl.eos_token())) {
+    if (inputs.add_eos && string_ends_with(result, tmpl.eos_token())) {
         result = result.substr(0, result.size() - tmpl.eos_token().size());
     }
     return result;
```
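Context for this hunk: chat templates frequently render the BOS/EOS tokens themselves, and when the vocab is flagged `add_bos`/`add_eos` the tokenizer will prepend/append them again, so `apply()` trims the rendered copy. The change gates that trim on the new flags, so templates for models whose tokenizer does not re-add the tokens are left intact. A minimal sketch of the trimming with local helpers (`string_starts_with`/`string_ends_with` in llama.cpp are assumed to behave the same way):

```cpp
#include <string>

static bool starts_with(const std::string & s, const std::string & p) {
    return s.rfind(p, 0) == 0;
}
static bool ends_with(const std::string & s, const std::string & p) {
    return s.size() >= p.size() && s.compare(s.size() - p.size(), p.size(), p) == 0;
}

// Drop a template-rendered BOS/EOS only when the tokenizer would add its own,
// avoiding a doubled sentinel token in the final prompt.
std::string trim_sentinels(std::string result, const std::string & bos, const std::string & eos,
                           bool add_bos, bool add_eos) {
    if (add_bos && !bos.empty() && starts_with(result, bos)) {
        result.erase(0, bos.size());
    }
    if (add_eos && !eos.empty() && ends_with(result, eos)) {
        result.resize(result.size() - eos.size());
    }
    return result;
}

int main() {
    return trim_sentinels("<s>hi</s>", "<s>", "</s>", true, true) == "hi" ? 0 : 1;
}
```

The vocab flags feeding `inputs.add_bos` / `inputs.add_eos` come from `llama_vocab_get_add_bos()` / `llama_vocab_get_add_eos()` in the `common_chat_templates_init` hunk above.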
```diff
@@ -1289,6 +1305,26 @@ static void common_chat_parse_deepseek_r1(common_chat_msg_parser & builder) {
         tool_calls_end);
 }
 
+static common_chat_params common_chat_params_init_gpt_oss(const common_chat_template & tmpl, const struct templates_params & inputs) {
+    common_chat_params data;
+    auto prompt = apply(tmpl, inputs);
+
+    data.prompt = prompt;
+    data.format = COMMON_CHAT_FORMAT_GPT_OSS;
+
+    // TODO: support tool calls in GPT-OSS?
+
+    return data;
+}
+static void common_chat_parse_gpt_oss(common_chat_msg_parser & builder) {
+    // TODO @ngxson : this won't work with --special enabled, we should fix that
+    builder.try_parse_reasoning("<|channel|>analysis<|message|>", "<|start|>assistant<|channel|>final<|message|>");
+    if (!builder.syntax().parse_tool_calls) {
+        builder.add_content(builder.consume_rest());
+        return;
+    }
+}
+
 static common_chat_params common_chat_params_init_firefunction_v2(const common_chat_template & tmpl, const struct templates_params & inputs) {
     LOG_DBG("%s\n", __func__);
     common_chat_params data;
```
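`try_parse_reasoning()` is a llama.cpp parser internal; the call above asks it to treat text between the analysis-channel opener and the final-channel opener as reasoning, so that (under a deepseek-style reasoning format) the analysis channel lands in `message.reasoning_content` and the final channel in `message.content`. A hedged standalone sketch of that marker-based split; the struct and function names here are illustrative only:

```cpp
#include <string>

struct parsed_msg {
    std::string reasoning; // analysis channel
    std::string content;   // final channel
};

parsed_msg split_gpt_oss(const std::string & out) {
    const std::string think_open = "<|channel|>analysis<|message|>";
    const std::string final_open = "<|start|>assistant<|channel|>final<|message|>";

    parsed_msg msg;
    const size_t a = out.find(think_open);
    const size_t b = out.find(final_open);
    if (a != std::string::npos && b != std::string::npos && a < b) {
        const size_t r0 = a + think_open.size();
        msg.reasoning = out.substr(r0, b - r0);            // between the two markers
        msg.content   = out.substr(b + final_open.size()); // after the final marker
    } else {
        msg.content = out; // markers absent: everything is plain content
    }
    return msg;
}

int main() {
    const auto msg = split_gpt_oss(
        "<|channel|>analysis<|message|>chain of thought"
        "<|start|>assistant<|channel|>final<|message|>Hello!");
    return msg.content == "Hello!" ? 0 : 1;
}
```

Tool calls are explicitly left as a TODO in both new functions, so with `parse_tool_calls` unset the parser simply consumes the remainder as plain content.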
```diff
@@ -1731,6 +1767,8 @@ static common_chat_params common_chat_templates_apply_jinja(
     params.enable_thinking = inputs.enable_thinking;
     params.grammar = inputs.grammar;
     params.now = inputs.now;
+    params.add_bos = inputs.add_bos;
+    params.add_eos = inputs.add_eos;
 
     params.extra_context = json::object();
     for (auto el : inputs.chat_template_kwargs) {
@@ -1772,6 +1810,11 @@ static common_chat_params common_chat_templates_apply_jinja(
         return common_chat_params_init_hermes_2_pro(tmpl, params);
     }
 
+    // GPT-OSS
+    if (src.find("<|channel|>") != std::string::npos && params.json_schema.is_null()) {
+        return common_chat_params_init_gpt_oss(tmpl, params);
+    }
+
     // Use generic handler when mixing tools + JSON schema.
     // TODO: support that mix in handlers below.
     if ((params.tools.is_array() && params.json_schema.is_object())) {
@@ -1923,6 +1966,9 @@ static void common_chat_parse(common_chat_msg_parser & builder) {
         case COMMON_CHAT_FORMAT_COMMAND_R7B:
             common_chat_parse_command_r7b(builder);
             break;
+        case COMMON_CHAT_FORMAT_GPT_OSS:
+            common_chat_parse_gpt_oss(builder);
+            break;
         default:
             throw std::runtime_error(std::string("Unsupported format: ") + common_chat_format_name(builder.syntax().format));
     }
```

common/chat.h

Lines changed: 3 additions & 0 deletions
```diff
@@ -109,6 +109,7 @@ enum common_chat_format {
     COMMON_CHAT_FORMAT_FUNCTIONARY_V3_1_LLAMA_3_1,
     COMMON_CHAT_FORMAT_HERMES_2_PRO,
     COMMON_CHAT_FORMAT_COMMAND_R7B,
+    COMMON_CHAT_FORMAT_GPT_OSS,
 
     COMMON_CHAT_FORMAT_COUNT, // Not a format, just the # formats
 };
@@ -127,6 +128,8 @@ struct common_chat_templates_inputs {
     bool enable_thinking = true;
     std::chrono::system_clock::time_point now = std::chrono::system_clock::now();
     std::map<std::string, std::string> chat_template_kwargs;
+    bool add_bos = false;
+    bool add_eos = false;
 };
 
 struct common_chat_params {
```

common/common.h

Lines changed: 3 additions & 2 deletions
```diff
@@ -236,6 +236,7 @@ struct common_params_diffusion {
 
 enum common_reasoning_format {
     COMMON_REASONING_FORMAT_NONE,
+    COMMON_REASONING_FORMAT_AUTO,
     COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY, // Extract thinking tag contents and return as `message.reasoning_content`, or leave inline in <think> tags in stream mode
     COMMON_REASONING_FORMAT_DEEPSEEK, // Extract thinking tag contents and return as `message.reasoning_content`, including in streaming deltas.
 };
@@ -394,7 +395,7 @@ struct common_params {
     std::string chat_template = ""; // NOLINT
     bool use_jinja = false; // NOLINT
     bool enable_chat_template = true;
-    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
+    common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_AUTO;
     int reasoning_budget = -1;
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
 
@@ -439,7 +440,7 @@ struct common_params {
     int32_t n_out_freq = 10; // output the imatrix every n_out_freq iterations
     int32_t n_save_freq = 0; // save the imatrix every n_save_freq iterations
     int32_t i_chunk = 0; // start processing from this chunk
-    bool imat_dat = false; // whether the legacy imatrix.dat format should be output
+    int8_t imat_dat = 0; // whether the legacy imatrix.dat format should be output (gguf <= 0 < dat)
 
     bool process_output = false; // collect data for the output tensor
     bool compute_ppl = true; // whether to compute perplexity
```
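Since `COMMON_REASONING_FORMAT_AUTO` is both new and the new default, here is the accepted set of `--reasoning-format` values from the `common/arg.cpp` hunk above, restated as a self-contained helper a unit test could call directly (the enum mirrors this header):

```cpp
#include <stdexcept>
#include <string>

enum common_reasoning_format {
    COMMON_REASONING_FORMAT_NONE,
    COMMON_REASONING_FORMAT_AUTO,            // new in this commit, and the new default
    COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY,
    COMMON_REASONING_FORMAT_DEEPSEEK,
};

// Same contract as the CLI lambda above: unknown strings throw.
common_reasoning_format parse_reasoning_format(const std::string & value) {
    if (value == "deepseek")        return COMMON_REASONING_FORMAT_DEEPSEEK;
    if (value == "deepseek-legacy") return COMMON_REASONING_FORMAT_DEEPSEEK_LEGACY;
    if (value == "none")            return COMMON_REASONING_FORMAT_NONE;
    if (value == "auto")            return COMMON_REASONING_FORMAT_AUTO;
    throw std::invalid_argument("invalid value");
}

int main() {
    return parse_reasoning_format("auto") == COMMON_REASONING_FORMAT_AUTO ? 0 : 1;
}
```

Note that the option's help text in `common/arg.cpp` still describes only `none` and `deepseek`; `auto` and `deepseek-legacy` are accepted by the parser but not yet listed there.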
