Skip to content

Commit 78a2261

Browse files
committed
fix: Server v2 production issues (PR #788)
Fixes four critical production issues in llamafile Server v2:

1. **Fix .args loading timing** (llama.cpp main/main.cpp)
   - Move cosmo_args() call before determine_program()
   - Ensures --server --v2 flags in .args are seen when determining program mode
   - Fixes #783

2. **Add URL prefix normalization** (llamafile/flags.cpp)
   - Consolidate consecutive slashes (//api/v1 → /api/v1)
   - Ensure leading slash, remove trailing slash
   - Validate AFTER normalization
   - Use static std::string for proper lifetime management (no memory leak)
   - Fixes #767

3. **Robust partial write handling** (llamafile/server/client.cpp)
   - Implement full write loop to handle partial writes correctly
   - Handle EINTR (signal interruption) gracefully
   - Properly detect connection closure
   - Increase file transfer buffer from 512B to 16KB for better performance

4. **Remove aggressive client dropping** (llamafile/server/worker.cpp)
   - Remove code that kills oldest active connection when all workers busy
   - Let TCP listen backlog naturally queue incoming connections
   - Provides better UX (graceful queuing vs abrupt disconnection)
   - Fixes #787

All fixes improve upon original PR #788 with better error handling and no memory leaks.
1 parent 9509d91 commit 78a2261

File tree

4 files changed

+63
-22
lines changed

4 files changed

+63
-22
lines changed

llama.cpp.patches/patches/main_main.cpp.patch

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -107,7 +107,7 @@
107107

108108
static void llama_log_callback_logTee(ggml_log_level level, const char * text, void * user_data) {
109109
(void) level;
110-
@@ -128,7 +146,91 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
110+
@@ -128,7 +146,94 @@ static std::string chat_add_and_format(struct llama_model * model, std::vector<l
111111
return formatted;
112112
}
113113

@@ -164,6 +164,10 @@
164164
+ __builtin_unreachable();
165165
+ }
166166
+
167+
+ // Load .args file BEFORE determining program type
168+
+ // so that flags like --server --v2 in .args are seen
169+
+ argc = cosmo_args("/zip/.args", &argv);
170+
+
167171
+ enum Program prog = determine_program(argv);
168172
+ if (prog == LLAMAFILER)
169173
+ return lf::server::main(argc, argv);
@@ -172,7 +176,6 @@
172176
+ mallopt(M_MMAP_THRESHOLD, 16 * 1024 * 1024);
173177
+ mallopt(M_TRIM_THRESHOLD, 128 * 1024 * 1024);
174178
+ ShowCrashReports();
175-
+ argc = cosmo_args("/zip/.args", &argv);
176179
+
177180
+ if (prog == SERVER)
178181
+ return server_cli(argc, argv);

llamafile/flags.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -313,15 +313,39 @@ void llamafile_get_flags(int argc, char **argv) {
313313
if (!strcmp(flag, "--url-prefix")) {
314314
if (i == argc)
315315
missing("--url-prefix");
316-
FLAG_url_prefix = argv[i++];
317-
if (!IsAcceptablePath(FLAG_url_prefix, -1)) {
318-
tinyprint(2, "error: --url-prefix must not have // or /. or /./ or /../\n", NULL);
319-
exit(1);
316+
317+
std::string url_prefix = argv[i++];
318+
319+
// Consolidate consecutive slashes
320+
size_t pos = 0;
321+
while ((pos = url_prefix.find("//", pos)) != std::string::npos) {
322+
url_prefix.replace(pos, 2, "/");
320323
}
321-
if (endswith(FLAG_url_prefix, "/")) {
322-
tinyprint(2, "error: --url-prefix must not be slash or end with slash\n", NULL);
324+
325+
// Ensure single slash at start
326+
if (url_prefix.empty() || url_prefix[0] != '/') {
327+
url_prefix = "/" + url_prefix;
328+
}
329+
330+
// Remove trailing slash if present
331+
if (url_prefix.length() > 1 && url_prefix.back() == '/') {
332+
url_prefix.pop_back();
333+
}
334+
335+
// If only a single slash remains, convert to empty string
336+
if (url_prefix == "/") {
337+
url_prefix = "";
338+
}
339+
340+
// Validate the normalized path
341+
if (!url_prefix.empty() && !IsAcceptablePath(url_prefix.c_str(), url_prefix.length())) {
342+
tinyprint(2, "error: --url-prefix must not have /. or /./ or /../ after normalization\n", NULL);
323343
exit(1);
324344
}
345+
346+
// Store in static storage (persists for program lifetime)
347+
static std::string stored_prefix = url_prefix;
348+
FLAG_url_prefix = stored_prefix.c_str();
325349
continue;
326350
}
327351

llamafile/server/client.cpp

Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -522,12 +522,30 @@ Client::send_response_finish()
522522
bool
523523
Client::send_binary(const void* p, size_t n)
524524
{
525-
ssize_t sent;
526-
if ((sent = write(fd_, p, n)) != n) {
527-
if (sent == -1 && errno != EAGAIN && errno != ECONNRESET)
528-
SLOG("write failed %m");
529-
close_connection_ = true;
530-
return false;
525+
size_t total_sent = 0;
526+
const char* ptr = (const char*)p;
527+
528+
while (total_sent < n) {
529+
ssize_t sent = write(fd_, ptr + total_sent, n - total_sent);
530+
531+
if (sent > 0) {
532+
total_sent += sent;
533+
} else if (sent == 0) {
534+
// Connection closed
535+
close_connection_ = true;
536+
return false;
537+
} else {
538+
// Error occurred
539+
if (errno == EINTR) {
540+
// Interrupted by signal, retry
541+
continue;
542+
}
543+
if (errno != EAGAIN && errno != ECONNRESET) {
544+
SLOG("write failed %m");
545+
}
546+
close_connection_ = true;
547+
return false;
548+
}
531549
}
532550
return true;
533551
}
@@ -775,7 +793,7 @@ Client::dispatcher()
775793
should_send_error_if_canceled_ = false;
776794
if (!send(std::string_view(obuf_.p, p - obuf_.p)))
777795
return false;
778-
char buf[512];
796+
char buf[16384];
779797
size_t i, chunk;
780798
for (i = 0; i < size; i += chunk) {
781799
chunk = size - i;

llamafile/server/worker.cpp

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -56,13 +56,9 @@ Worker::begin()
5656
tokens = tokenbucket_acquire(client_.client_ip_);
5757
server_->lock();
5858
dll_remove(&server_->idle_workers, &elem_);
59-
if (dll_is_empty(server_->idle_workers)) {
60-
Dll* slowbro;
61-
if ((slowbro = dll_last(server_->active_workers))) {
62-
SLOG("all threads active! dropping oldest client");
63-
WORKER(slowbro)->kill();
64-
}
65-
}
59+
// Remove aggressive client cancellation - let TCP backlog handle overflow
60+
// The kernel's listen backlog will naturally queue incoming connections
61+
// until a worker becomes available, providing better user experience
6662
working_ = true;
6763
if (tokens > FLAG_token_burst) {
6864
dll_make_last(&server_->active_workers, &elem_);

0 commit comments

Comments (0)