Commit 764e1e1

fix: configurable timeout for AI requests (#6497) (#7267)
* fix: configurable timeout for AI requests

  Add the AI_REQUEST_TIMEOUT_SECONDS environment variable (default 3600s) to fix
  timeout issues with slow AI models such as self-hosted Ollama. The timeout was
  previously hardcoded at 300 seconds, causing legitimate long-running requests
  to fail.

  Fixes #6497

* docs(ai): add critical NGINX configuration warning

  Add comprehensive documentation about reverse proxy timeout requirements.
  Without proper NGINX/proxy configuration, connections will still time out at
  the proxy layer regardless of backend timeout settings.

  The enhanced documentation includes:
  - a CRITICAL warning about the proxy configuration requirement
  - an example NGINX configuration snippet
  - an explanation of the proxy vs backend timeout interaction

  This addresses the root cause in issue #6497, where logs showed "upstream
  prematurely closed connection", indicating a proxy-level timeout.

  Part of #6497
1 parent a23d4f0 commit 764e1e1
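
As a quick reference, the resolution rule the commit message describes (use the value from AI_REQUEST_TIMEOUT_SECONDS when it falls in 1-86400 seconds, otherwise fall back to 3600) can be sketched in a standalone form. This is a simplified illustration, not the commit's code; the real implementation, including logging, lives in the lazy_static block shown in the diff below.

    use std::env;

    // Bounds mirrored from the constants added in backend/windmill-api/src/ai.rs.
    const AI_TIMEOUT_MIN_SECS: u64 = 1;
    const AI_TIMEOUT_MAX_SECS: u64 = 86_400; // 24 hours
    const AI_TIMEOUT_DEFAULT_SECS: u64 = 3_600; // 1 hour

    /// Values inside 1..=86400 are honored; anything missing, unparsable,
    /// or out of range falls back to the 3600 s default.
    fn effective_ai_timeout_secs() -> u64 {
        match env::var("AI_REQUEST_TIMEOUT_SECONDS")
            .ok()
            .and_then(|s| s.parse::<u64>().ok())
        {
            Some(t) if (AI_TIMEOUT_MIN_SECS..=AI_TIMEOUT_MAX_SECS).contains(&t) => t,
            _ => AI_TIMEOUT_DEFAULT_SECS,
        }
    }

    fn main() {
        println!("effective AI request timeout: {}s", effective_ai_timeout_secs());
    }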

File tree

2 files changed: +68 −2 lines changed

  backend/windmill-api/src/ai.rs
  backend/windmill-common/src/global_settings.rs

backend/windmill-api/src/ai.rs

Lines changed: 67 additions & 2 deletions
@@ -14,11 +14,76 @@ use windmill_common::error::{to_anyhow, Error, Result};
 use windmill_common::utils::configure_client;
 use windmill_common::variables::get_variable_or_self;
 
+// AI timeout configuration constants
+const AI_TIMEOUT_MIN_SECS: u64 = 1;
+const AI_TIMEOUT_MAX_SECS: u64 = 86400; // 24 hours
+const AI_TIMEOUT_DEFAULT_SECS: u64 = 3600; // 1 hour
+const HTTP_POOL_MAX_IDLE_PER_HOST: usize = 10;
+const HTTP_POOL_IDLE_TIMEOUT_SECS: u64 = 90;
+
 lazy_static::lazy_static! {
+    /// AI request timeout in seconds.
+    ///
+    /// This timeout applies to the TOTAL duration of AI HTTP requests,
+    /// including streaming responses. Default is 3600 seconds (1 hour).
+    ///
+    /// Can be configured via the AI_REQUEST_TIMEOUT_SECONDS environment variable.
+    /// Valid range: 1-86400 seconds (24 hours).
+    /// - Minimum (1s): Prevents immediate timeout, allows minimal response time
+    /// - Maximum (24h): Prevents indefinite hangs while supporting long-running AI operations
+    /// - Default (1h): Balances responsiveness with support for complex AI tasks
+    ///
+    /// Note: This is a total request timeout, not an idle timeout.
+    /// Long-running streaming responses that exceed this duration will be terminated,
+    /// even if actively receiving data.
+    ///
+    /// CRITICAL: If using a reverse proxy (NGINX, Traefik, etc.), you MUST configure
+    /// proxy timeouts to match or exceed this value. Without proper proxy configuration,
+    /// connections will be terminated prematurely at the proxy layer regardless of this
+    /// backend timeout setting.
+    ///
+    /// Example NGINX configuration:
+    ///     location /api/ {
+    ///         proxy_read_timeout 3600s;    # Must be >= AI_REQUEST_TIMEOUT_SECONDS
+    ///         proxy_send_timeout 3600s;
+    ///         proxy_connect_timeout 60s;
+    ///     }
+    static ref AI_TIMEOUT_SECS: u64 = {
+        match std::env::var("AI_REQUEST_TIMEOUT_SECONDS")
+            .ok()
+            .and_then(|s| s.parse::<u64>().ok())
+        {
+            Some(timeout) if timeout >= AI_TIMEOUT_MIN_SECS && timeout <= AI_TIMEOUT_MAX_SECS => {
+                tracing::info!("AI request timeout configured: {}s", timeout);
+                timeout
+            },
+            Some(timeout) => {
+                tracing::warn!(
+                    "AI_REQUEST_TIMEOUT_SECONDS value {} is out of range ({}-{}), using default {}s",
+                    timeout,
+                    AI_TIMEOUT_MIN_SECS,
+                    AI_TIMEOUT_MAX_SECS,
+                    AI_TIMEOUT_DEFAULT_SECS
+                );
+                AI_TIMEOUT_DEFAULT_SECS
+            },
+            None => {
+                tracing::info!(
+                    "AI_REQUEST_TIMEOUT_SECONDS not set, using default {}s",
+                    AI_TIMEOUT_DEFAULT_SECS
+                );
+                AI_TIMEOUT_DEFAULT_SECS
+            },
+        }
+    };
+
     static ref HTTP_CLIENT: Client = configure_client(reqwest::ClientBuilder::new()
-        .timeout(std::time::Duration::from_secs(60 * 5))
+        .timeout(std::time::Duration::from_secs(*AI_TIMEOUT_SECS))
+        .pool_max_idle_per_host(HTTP_POOL_MAX_IDLE_PER_HOST)
+        .pool_idle_timeout(Some(std::time::Duration::from_secs(HTTP_POOL_IDLE_TIMEOUT_SECS)))
         .user_agent("windmill/beta"))
-        .build().unwrap();
+        .build()
+        .expect("Failed to build AI HTTP client - check system TLS configuration");
 
     static ref OPENAI_AZURE_BASE_PATH: Option<String> = std::env::var("OPENAI_AZURE_BASE_PATH").ok();
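
The doc comment above stresses that the timeout passed to reqwest's ClientBuilder::timeout bounds the total request duration, streaming included, rather than acting as an idle timeout. A minimal sketch of what that means for a caller is shown below; it is not part of the commit, and the URL and the 5-second budget are placeholders chosen only to make the cutoff easy to observe locally.

    use std::time::Duration;

    #[tokio::main]
    async fn main() -> Result<(), reqwest::Error> {
        // Same builder call as the diff, but with a deliberately short budget.
        let client = reqwest::Client::builder()
            .timeout(Duration::from_secs(5)) // caps the WHOLE request, not idle time
            .build()?;

        // Hypothetical slow streaming endpoint; substitute any SSE-style AI endpoint.
        let mut response = client
            .get("http://localhost:8080/slow-stream")
            .send()
            .await?;

        // Chunks may still be arriving, but once the 5 s budget is exhausted the
        // next chunk() call fails with a timeout error, mirroring how a too-small
        // AI_REQUEST_TIMEOUT_SECONDS cuts off long Ollama responses.
        while let Some(chunk) = response.chunk().await? {
            println!("received {} bytes", chunk.len());
        }
        Ok(())
    }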
backend/windmill-common/src/global_settings.rs

Lines changed: 1 addition & 0 deletions
@@ -114,6 +114,7 @@ pub const ENV_SETTINGS: &[&str] = &[
     "DISABLE_S3_STORE",
     "PG_SCHEMA",
     "PG_LISTENER_REFRESH_PERIOD_SECS",
+    "AI_REQUEST_TIMEOUT_SECONDS",
 ];
 
 use crate::error;
