From 9cec39f8061227e21429da998eff3731547caa1a Mon Sep 17 00:00:00 2001 From: Mykhailo Chalyi Date: Wed, 15 Apr 2026 08:14:18 -0500 Subject: [PATCH] fix(fetchers): cap direct llms bodies --- crates/fetchkit/src/fetchers/default.rs | 8 +++---- crates/fetchkit/src/fetchers/docs_site.rs | 28 +++++++++++++++++------ crates/fetchkit/tests/ssrf_security.rs | 27 ++++++++++++++++++++++ 3 files changed, 52 insertions(+), 11 deletions(-) diff --git a/crates/fetchkit/src/fetchers/default.rs b/crates/fetchkit/src/fetchers/default.rs index 23e07a9..c1afb46 100644 --- a/crates/fetchkit/src/fetchers/default.rs +++ b/crates/fetchkit/src/fetchers/default.rs @@ -46,17 +46,17 @@ const BINARY_PREFIXES: &[&str] = &[ const FIRST_BYTE_TIMEOUT: Duration = Duration::from_secs(1); // THREAT[TM-DOS-002]: Body timeout caps total request duration -const BODY_TIMEOUT: Duration = Duration::from_secs(30); +pub(crate) const BODY_TIMEOUT: Duration = Duration::from_secs(30); /// Truncation message appended when body is cut short (timeout or size limit) -const TRUNCATION_MESSAGE: &str = "\n\n[..content truncated...]"; +pub(crate) const TRUNCATION_MESSAGE: &str = "\n\n[..content truncated...]"; // THREAT[TM-SSRF-010]: Maximum redirects to follow with IP validation at each hop const MAX_REDIRECTS: usize = 10; // THREAT[TM-DOS-001]: Default max body size (10 MB) to prevent memory exhaustion // THREAT[TM-DOS-003]: Also protects against compressed content bombs (gzip bombs) -const DEFAULT_MAX_BODY_SIZE: usize = 10 * 1024 * 1024; +pub(crate) const DEFAULT_MAX_BODY_SIZE: usize = 10 * 1024 * 1024; /// Default HTTP fetcher /// @@ -640,7 +640,7 @@ fn parse_content_disposition_filename(value: &str) -> Option { /// due to timeout or exceeding `max_size`. // THREAT[TM-DOS-001]: Configurable max body size prevents unbounded memory usage // THREAT[TM-DOS-003]: Decompressed size is checked, catching gzip/brotli bombs -async fn read_body_with_timeout( +pub(crate) async fn read_body_with_timeout( response: reqwest::Response, timeout: Duration, max_size: usize, diff --git a/crates/fetchkit/src/fetchers/docs_site.rs b/crates/fetchkit/src/fetchers/docs_site.rs index f8b64ac..0db9208 100644 --- a/crates/fetchkit/src/fetchers/docs_site.rs +++ b/crates/fetchkit/src/fetchers/docs_site.rs @@ -10,7 +10,10 @@ use crate::client::FetchOptions; use crate::error::FetchError; -use crate::fetchers::default::{apply_bot_auth_if_enabled, send_request_following_redirects}; +use crate::fetchers::default::{ + apply_bot_auth_if_enabled, read_body_with_timeout, send_request_following_redirects, + BODY_TIMEOUT, DEFAULT_MAX_BODY_SIZE, TRUNCATION_MESSAGE, +}; use crate::fetchers::Fetcher; use crate::types::{FetchRequest, FetchResponse}; use crate::DEFAULT_USER_AGENT; @@ -219,6 +222,11 @@ async fn fetch_llms_txt_direct( let status_code = response.status().as_u16(); let final_url = response.url().to_string(); + let content_type = response + .headers() + .get("content-type") + .and_then(|v| v.to_str().ok()) + .map(|s| s.to_string()); if !response.status().is_success() { return Ok(FetchResponse { @@ -230,17 +238,23 @@ async fn fetch_llms_txt_direct( }); } - let body = response - .text() - .await - .map_err(|e| FetchError::RequestError(e.to_string()))?; + let max_body_size = options.max_body_size.unwrap_or(DEFAULT_MAX_BODY_SIZE); + let (body, truncated) = read_body_with_timeout(response, BODY_TIMEOUT, max_body_size).await; + let size = body.len() as u64; + let mut content = String::from_utf8_lossy(&body).to_string(); + + if truncated { + content.push_str(TRUNCATION_MESSAGE); + } Ok(FetchResponse { url: final_url, status_code: 200, - content_type: Some("text/plain".to_string()), + content_type, format: Some("documentation".to_string()), - content: Some(body), + content: Some(content), + size: Some(size), + truncated: if truncated { Some(true) } else { None }, redirect_chain, ..Default::default() }) diff --git a/crates/fetchkit/tests/ssrf_security.rs b/crates/fetchkit/tests/ssrf_security.rs index c8a28cf..5d16caa 100644 --- a/crates/fetchkit/tests/ssrf_security.rs +++ b/crates/fetchkit/tests/ssrf_security.rs @@ -717,6 +717,33 @@ async fn test_dos_001_body_within_limit_not_truncated() { assert!(resp.content.unwrap().contains("small body")); } +#[tokio::test] +async fn test_dos_001_direct_llms_txt_honors_body_size_limit() { + let mock_server = MockServer::start().await; + let large_body = "x".repeat(2000); + + Mock::given(method("GET")) + .and(path("/llms.txt")) + .respond_with( + ResponseTemplate::new(200) + .set_body_string(&large_body) + .insert_header("content-type", "text/plain"), + ) + .mount(&mock_server) + .await; + + let tool = Tool::builder() + .block_private_ips(false) + .max_body_size(1000) + .build(); + let req = FetchRequest::new(format!("{}/llms.txt", mock_server.uri())); + let resp = tool.execute(req).await.unwrap(); + + assert_eq!(resp.truncated, Some(true)); + assert!(resp.size.unwrap() <= 1000); + assert!(resp.content.unwrap().contains("[..content truncated...]")); +} + // ============================================================================ // TM-INPUT-007: URL-aware prefix matching // ============================================================================