
Commit c145f8a

server : slots monitoring endpoint (#5550)
1 parent 689a091 commit c145f8a

2 files changed: 96 additions & 0 deletions

examples/server/README.md

Lines changed: 64 additions & 0 deletions
@@ -40,6 +40,7 @@ see https://github.com/ggerganov/llama.cpp/issues/1437
 - `--grp-attn-n`: Set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`
 - `--grp-attn-w`: Set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`
 - `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `--slots-endpoint-disable`: Disables the slots state monitoring endpoint. Slots state may contain user data, including prompts.

 ## Build
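As a quick illustration of the new flag (an editor's sketch, not part of the commit): two possible launch lines, assuming the usual `./server` invocation from this README; the model path and context size are placeholders.

```sh
# Default behaviour: the /slots endpoint is served.
./server -m models/7B/ggml-model.gguf -c 2048

# Opt out of slot state monitoring, e.g. when prompts must not be exposed.
./server -m models/7B/ggml-model.gguf -c 2048 --slots-endpoint-disable
```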

@@ -381,6 +382,69 @@ Notice that each `probs` is an array of length `n_probs`.
 }'
 ```

+- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+
+### Result JSON
+
+```json
+[
+    {
+        "dynatemp_exponent": 1.0,
+        "dynatemp_range": 0.0,
+        "frequency_penalty": 0.0,
+        "grammar": "",
+        "id": 0,
+        "ignore_eos": false,
+        "logit_bias": [],
+        "min_p": 0.05000000074505806,
+        "mirostat": 0,
+        "mirostat_eta": 0.10000000149011612,
+        "mirostat_tau": 5.0,
+        "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+        "n_ctx": 2048,
+        "n_keep": 0,
+        "n_predict": 100000,
+        "n_probs": 0,
+        "next_token": {
+            "has_next_token": true,
+            "n_remain": -1,
+            "num_tokens_predicted": 0,
+            "stopped_eos": false,
+            "stopped_limit": false,
+            "stopped_word": false,
+            "stopping_word": ""
+        },
+        "penalize_nl": true,
+        "penalty_prompt_tokens": [],
+        "presence_penalty": 0.0,
+        "prompt": "Say hello to llama.cpp",
+        "repeat_last_n": 64,
+        "repeat_penalty": 1.100000023841858,
+        "samplers": [
+            "top_k",
+            "tfs_z",
+            "typical_p",
+            "top_p",
+            "min_p",
+            "temperature"
+        ],
+        "seed": 42,
+        "state": 1,
+        "stop": [
+            "\n"
+        ],
+        "stream": false,
+        "task_id": 0,
+        "temperature": 0.0,
+        "tfs_z": 1.0,
+        "top_k": 40,
+        "top_p": 0.949999988079071,
+        "typical_p": 1.0,
+        "use_penalty_prompt_tokens": false
+    }
+]
+```
+
 ## More examples

 ### Change system prompt on runtime
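For reference (an editor's sketch, not part of the commit), a minimal way to query the new endpoint from the command line; the host and port are assumptions matching the server defaults (`127.0.0.1:8080`), and `python3 -m json.tool` is used only for pretty-printing.

```sh
# Fetch the current state of all slots as a JSON array like the example above.
curl -s http://127.0.0.1:8080/slots | python3 -m json.tool

# If the server was started with --slots-endpoint-disable, the /slots route is
# not registered and no slot data is returned.
```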

examples/server/server.cpp

Lines changed: 32 additions & 0 deletions
@@ -41,6 +41,7 @@ struct server_params
     int32_t port = 8080;
     int32_t read_timeout = 600;
     int32_t write_timeout = 600;
+    bool slots_endpoint = true;
 };

 bool server_verbose = false;
@@ -1926,6 +1927,7 @@ static void server_print_usage(const char *argv0, const gpt_params &params,
     printf("                            set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
     printf("  --mmproj MMPROJ_FILE      path to a multimodal projector file for LLaVA.\n");
     printf("  --log-disable             disables logging to a file.\n");
+    printf("  --slots-endpoint-disable  disables slots monitoring endpoint.\n");
     printf("\n");
     printf("  -n, --n-predict           maximum tokens to predict (default: %d)\n", params.n_predict);
     printf("  --override-kv KEY=TYPE:VALUE\n");
@@ -2374,6 +2376,10 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
             log_set_target(stdout);
             LOG_INFO("logging to file is disabled.", {});
         }
+        else if (arg == "--slots-endpoint-disable")
+        {
+            sparams.slots_endpoint = false;
+        }
         else if (arg == "--chat-template")
         {
             if (++i >= argc)
@@ -2619,6 +2625,32 @@ int main(int argc, char **argv)
         }
     });

+    if (sparams.slots_endpoint) {
+        svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+            json slots;
+            for (llama_client_slot & slot : llama.slots) {
+                json slot_data = llama.get_formated_generation(slot);
+                slot_data["id"] = slot.id;
+                slot_data["task_id"] = slot.task_id;
+                slot_data["state"] = slot.state;
+                slot_data["prompt"] = slot.prompt;
+                slot_data["next_token"] = {
+                    {"has_next_token", slot.has_next_token},
+                    {"n_remain", slot.n_remaining},
+                    {"num_tokens_predicted", slot.n_decoded},
+                    {"stopped_eos", slot.stopped_eos},
+                    {"stopped_word", slot.stopped_word},
+                    {"stopped_limit", slot.stopped_limit},
+                    {"stopping_word", slot.stopping_word},
+                };
+
+                slots.push_back(slot_data);
+            }
+            res.set_content(slots.dump(), "application/json");
+            res.status = 200; // HTTP OK
+        });
+    }
+
     svr.set_logger(log_server_request);

     svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)
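The handler above serializes every `llama_client_slot` into a JSON array on each request, so the endpoint is cheap to poll. A possible way to watch slot activity while the server runs (an editor's sketch, not from the commit; host, port and interval are assumptions):

```sh
# Poll /slots every 2 seconds; each response is one JSON array with one
# object per slot (id, state, prompt, sampling settings, next_token, ...).
while true; do
    curl -s http://127.0.0.1:8080/slots
    echo
    sleep 2
done
```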
