Skip to content

Commit 78a2261

Browse files
committed
fix: Server v2 production issues (PR #788)
Fixes four critical production issues in llamafile Server v2:

1. **Fix .args loading timing** (llama.cpp main/main.cpp)
   - Move cosmo_args() call before determine_program()
   - Ensures --server --v2 flags in .args are seen when determining program mode
   - Fixes #783

2. **Add URL prefix normalization** (llamafile/flags.cpp)
   - Consolidate consecutive slashes (//api/v1 → /api/v1)
   - Ensure leading slash, remove trailing slash
   - Validate AFTER normalization
   - Use static std::string for proper lifetime management (no memory leak)
   - Fixes #767

3. **Robust partial write handling** (llamafile/server/client.cpp)
   - Implement full write loop to handle partial writes correctly
   - Handle EINTR (signal interruption) gracefully
   - Properly detect connection closure
   - Increase file transfer buffer from 512B to 16KB for better performance

4. **Remove aggressive client dropping** (llamafile/server/worker.cpp)
   - Remove code that kills oldest active connection when all workers busy
   - Let TCP listen backlog naturally queue incoming connections
   - Provides better UX (graceful queuing vs abrupt disconnection)
   - Fixes #787

All fixes improve upon original PR #788 with better error handling and no memory leaks.
1 parent 9509d91 commit 78a2261

File tree

4 files changed

+63
-22
lines changed

4 files changed

+63
-22
lines changed

llama.cpp.patches/patches/main_main.cpp.patch

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@
107107

108108
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
109109
(void) level;
110-
@@ -128,7 +146,91 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
110+
@@ -128,7 +146,94 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
111111
return formatted;
112112
}
113113

@@ -164,6 +164,10 @@
164164
+ __builtin_unreachable();
165165
+ }
166166
+
167+
+ // Load .args file BEFORE determining program type
168+
+ // so that flags like --server --v2 in .args are seen
169+
+ argc = cosmo_args("/zip/.args", &argv);
170+
+
167171
+ enum Program prog = determine_program(argv);
168172
+ if (prog == LLAMAFILER)
169173
+ return lf::server::main(argc, argv);
@@ -172,7 +176,6 @@
172176
+ mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
173177
+ mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
174178
+ ShowCrashReports();
175-
+ argc = cosmo_args("/zip/.args", &argv);
176179
+
177180
+ if (prog == SERVER)
178181
+ return server_cli(argc, argv);

llamafile/flags.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -313,15 +313,39 @@ void llamafile_get_flags(int argc, char **argv) {
313313
if (!strcmp(flag, "--url-prefix")) {
314314
if (i == argc)
315315
missing("--url-prefix");
316-
FLAG_url_prefix = argv[i++];
317-
if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
318-
tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
319-
exit(1);
316+
317+
std::string url_prefix = argv[i++];
318+
319+
// Consolidate consecutive slashes
320+
size_t pos = 0;
321+
while ((pos = url_prefix.find("//", pos)) != std::string::npos) {
322+
url_prefix.replace(pos, 2, "/");
320323
}
321-
if (endswith(FLAG_url_prefix, "/")) {
322-
tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
324+
325+
// Ensure single slash at start
326+
if (url_prefix.empty() || url_prefix[0] != '/') {
327+
url_prefix = "/" + url_prefix;
328+
}
329+
330+
// Remove trailing slash if present
331+
if (url_prefix.length() > 1 && url_prefix.back() == '/') {
332+
url_prefix.pop_back();
333+
}
334+
335+
// If only a single slash remains, convert to empty string
336+
if (url_prefix == "/") {
337+
url_prefix = "";
338+
}
339+
340+
// Validate the normalized path
341+
if (!url_prefix.empty() && !IsAcceptablePath(url_prefix.c_str(), url_prefix.length())) {
342+
tinyprint(2, "error: --url-prefix must not have /. or /./ or /../ after normalization\n", NULL);
323343
exit(1);
324344
}
345+
346+
// Store in static storage (persists for program lifetime)
347+
static std::string stored_prefix = url_prefix;
348+
FLAG_url_prefix = stored_prefix.c_str();
325349
continue;
326350
}
327351

llamafile/server/client.cpp

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -522,12 +522,30 @@ Client::send_response_finish()
522522
bool
523523
Client::send_binary(const void* p, size_t n)
524524
{
525-
ssize_t sent;
526-
if ((sent = write(fd_, p, n)) != n) {
527-
if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
528-
SLOG("write failed %m");
529-
close_connection_ = true;
530-
return false;
525+
size_t total_sent = 0;
526+
const char* ptr = (const char*)p;
527+
528+
while (total_sent < n) {
529+
ssize_t sent = write(fd_, ptr + total_sent, n - total_sent);
530+
531+
if (sent > 0) {
532+
total_sent += sent;
533+
} else if (sent == 0) {
534+
// Connection closed
535+
close_connection_ = true;
536+
return false;
537+
} else {
538+
// Error occurred
539+
if (errno == EINTR) {
540+
// Interrupted by signal, retry
541+
continue;
542+
}
543+
if (errno != EAGAIN && errno != ECONNRESET) {
544+
SLOG("write failed %m");
545+
}
546+
close_connection_ = true;
547+
return false;
548+
}
531549
}
532550
return true;
533551
}
@@ -775,7 +793,7 @@ Client::dispatcher()
775793
should_send_error_if_canceled_ = false;
776794
if (!send(std::string_view(obuf_.p, p - obuf_.p)))
777795
return false;
778-
char buf[512];
796+
char buf[16384];
779797
size_t i, chunk;
780798
for (i = 0; i < size; i += chunk) {
781799
chunk = size - i;

llamafile/server/worker.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,9 @@ Worker::begin()
5656
tokens = tokenbucket_acquire(client_.client_ip_);
5757
server_->lock();
5858
dll_remove(&server_->idle_workers, &elem_);
59-
if (dll_is_empty(server_->idle_workers)) {
60-
Dll* slowbro;
61-
if ((slowbro = dll_last(server_->active_workers))) {
62-
SLOG("all threads active! dropping oldest client");
63-
WORKER(slowbro)->kill();
64-
}
65-
}
59+
// Remove aggressive client cancellation - let TCP backlog handle overflow
60+
// The kernel's listen backlog will naturally queue incoming connections
61+
// until a worker becomes available, providing better user experience
6662
working_ = true;
6763
if (tokens > FLAG_token_burst) {
6864
dll_make_last(&server_->active_workers, &elem_);

0 commit comments

Comments (0)