Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 65 additions & 46 deletions crates/fetchkit/src/fetchers/docs_site.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,12 @@

use crate::client::FetchOptions;
use crate::error::FetchError;
use crate::fetchers::default::{apply_bot_auth_if_enabled, send_request_following_redirects};
use crate::fetchers::Fetcher;
use crate::types::{FetchRequest, FetchResponse};
use crate::DEFAULT_USER_AGENT;
use async_trait::async_trait;
use reqwest::header::{HeaderValue, USER_AGENT};
use reqwest::header::{HeaderMap, HeaderValue, ACCEPT, USER_AGENT};
use std::time::Duration;
use url::Url;

Expand Down Expand Up @@ -102,28 +103,13 @@ impl Fetcher for DocsSiteFetcher {
options: &FetchOptions,
) -> Result<FetchResponse, FetchError> {
let url = Url::parse(&request.url).map_err(|_| FetchError::InvalidUrlScheme)?;

let user_agent = options.user_agent.as_deref().unwrap_or(DEFAULT_USER_AGENT);
let mut client_builder = reqwest::Client::builder()
.connect_timeout(PROBE_TIMEOUT)
.timeout(PROBE_TIMEOUT)
.redirect(reqwest::redirect::Policy::limited(5));

if !options.respect_proxy_env {
// THREAT[TM-NET-004]: Ignore ambient proxy env by default
client_builder = client_builder.no_proxy();
}

let client = client_builder
.build()
.map_err(FetchError::ClientBuildError)?;

let ua_header = HeaderValue::from_str(user_agent)
.unwrap_or_else(|_| HeaderValue::from_static(DEFAULT_USER_AGENT));

// If this IS a direct llms.txt URL, fetch it directly
if Self::is_llms_txt_url(&url) {
return fetch_llms_txt_direct(&client, &request.url, &ua_header, request).await;
return fetch_llms_txt_direct(url, ua_header, options).await;
}

// For docs sites, probe for llms.txt at origin
Expand All @@ -141,7 +127,8 @@ impl Fetcher for DocsSiteFetcher {
];

for (probe_url, source) in &probe_urls {
if let Some(content) = try_fetch_llms_txt(&client, probe_url, &ua_header).await {
let probe_url = Url::parse(probe_url).map_err(|_| FetchError::InvalidUrlScheme)?;
if let Some(content) = try_fetch_llms_txt(probe_url, ua_header.clone(), options).await {
return Ok(FetchResponse {
url: request.url.clone(),
status_code: 200,
Expand All @@ -154,14 +141,24 @@ impl Fetcher for DocsSiteFetcher {
}

// No llms.txt — fetch the docs page directly and return raw content
let response = client
.get(&request.url)
.header(USER_AGENT, ua_header)
.send()
.await
.map_err(FetchError::from_reqwest)?;
let mut headers = HeaderMap::new();
headers.insert(USER_AGENT, ua_header);
headers.insert(
ACCEPT,
HeaderValue::from_static("text/html, text/plain, text/markdown, */*"),
);
let headers = apply_bot_auth_if_enabled(headers, options, &url);
let (response, redirect_chain) = send_request_following_redirects(
url,
reqwest::Method::GET,
headers,
options,
PROBE_TIMEOUT,
)
.await?;

let status_code = response.status().as_u16();
let final_url = response.url().to_string();
let content_type = response
.headers()
.get("content-type")
Expand All @@ -187,36 +184,47 @@ impl Fetcher for DocsSiteFetcher {
};

Ok(FetchResponse {
url: request.url.clone(),
url: final_url,
status_code,
content_type,
format: Some(format),
content: Some(content),
redirect_chain,
..Default::default()
})
}
}

/// Fetch a direct llms.txt URL
async fn fetch_llms_txt_direct(
client: &reqwest::Client,
url: &str,
ua_header: &HeaderValue,
request: &FetchRequest,
url: Url,
ua_header: HeaderValue,
options: &FetchOptions,
) -> Result<FetchResponse, FetchError> {
let response = client
.get(url)
.header(USER_AGENT, ua_header.clone())
.send()
.await
.map_err(FetchError::from_reqwest)?;
let mut headers = HeaderMap::new();
headers.insert(USER_AGENT, ua_header);
headers.insert(
ACCEPT,
HeaderValue::from_static("text/plain, text/markdown, */*"),
);
let headers = apply_bot_auth_if_enabled(headers, options, &url);
let (response, redirect_chain) = send_request_following_redirects(
url,
reqwest::Method::GET,
headers,
options,
PROBE_TIMEOUT,
)
.await?;

let status_code = response.status().as_u16();
let final_url = response.url().to_string();

if !response.status().is_success() {
return Ok(FetchResponse {
url: request.url.clone(),
url: final_url,
status_code,
redirect_chain,
error: Some(format!("HTTP {}", status_code)),
..Default::default()
});
Expand All @@ -228,27 +236,38 @@ async fn fetch_llms_txt_direct(
.map_err(|e| FetchError::RequestError(e.to_string()))?;

Ok(FetchResponse {
url: request.url.clone(),
url: final_url,
status_code: 200,
content_type: Some("text/plain".to_string()),
format: Some("documentation".to_string()),
content: Some(body),
redirect_chain,
..Default::default()
})
}

/// Try to fetch an llms.txt URL. Returns Some(content) on success.
async fn try_fetch_llms_txt(
client: &reqwest::Client,
url: &str,
ua_header: &HeaderValue,
url: Url,
ua_header: HeaderValue,
options: &FetchOptions,
) -> Option<String> {
let response = client
.get(url)
.header(USER_AGENT, ua_header.clone())
.send()
.await
.ok()?;
let mut headers = HeaderMap::new();
headers.insert(USER_AGENT, ua_header);
headers.insert(
ACCEPT,
HeaderValue::from_static("text/plain, text/markdown, */*"),
);
let headers = apply_bot_auth_if_enabled(headers, options, &url);
let (response, _) = send_request_following_redirects(
url,
reqwest::Method::GET,
headers,
options,
PROBE_TIMEOUT,
)
.await
.ok()?;

if !response.status().is_success() {
return None;
Expand Down
52 changes: 52 additions & 0 deletions crates/fetchkit/tests/ssrf_security.rs
Original file line number Diff line number Diff line change
Expand Up @@ -529,6 +529,58 @@ async fn test_ssrf_010_rss_fetcher_enforces_same_host_redirect_policy() {
assert!(matches!(result, Err(FetchError::BlockedUrl)));
}

/// A direct `/llms.txt` request aimed at the local mock server must be refused
/// by a default-configured `Tool`, even though the server would happily answer
/// with a 200 and a plain-text body.
///
/// The expected outcome is `Err(FetchError::BlockedUrl)`.
#[tokio::test]
async fn test_ssrf_010_docs_site_blocks_loopback_llms_txt_by_default() {
    let server = MockServer::start().await;

    // The endpoint exists and would succeed if the request were ever sent.
    let llms_response = ResponseTemplate::new(200)
        .set_body_string("Docs for agents")
        .insert_header("content-type", "text/plain");
    Mock::given(method("GET"))
        .and(path("/llms.txt"))
        .respond_with(llms_response)
        .mount(&server)
        .await;

    // Target the mock server's own URI with default tool settings.
    let target = format!("{}/llms.txt", server.uri());
    let outcome = Tool::default().execute(FetchRequest::new(target)).await;

    assert!(matches!(outcome, Err(FetchError::BlockedUrl)));
}

/// With private-IP blocking disabled but `same_host_redirects_only` enabled,
/// a `/llms.txt` request made via `localhost` that is redirected (302) to a
/// `127.0.0.1` URL on the same port must fail with `FetchError::BlockedUrl`.
///
/// The redirect destination is mounted and would return 200, so the only
/// thing that can produce the expected error is the redirect policy itself.
#[tokio::test]
async fn test_ssrf_010_docs_site_llms_txt_enforces_same_host_redirect_policy() {
    let server = MockServer::start().await;
    let port = server.address().port();

    // Redirect destination: same server, but addressed by IP literal rather
    // than by the `localhost` name used for the initial request.
    let redirect_target = format!("http://127.0.0.1:{}/final-llms.txt", port);

    let redirect_response = ResponseTemplate::new(302).insert_header("Location", &redirect_target);
    Mock::given(method("GET"))
        .and(path("/llms.txt"))
        .respond_with(redirect_response)
        .mount(&server)
        .await;

    let final_response = ResponseTemplate::new(200)
        .set_body_string("Redirected docs for agents")
        .insert_header("content-type", "text/plain");
    Mock::given(method("GET"))
        .and(path("/final-llms.txt"))
        .respond_with(final_response)
        .mount(&server)
        .await;

    // Allow private addresses so the initial request goes through; keep the
    // same-host redirect restriction on so the hop to 127.0.0.1 is rejected.
    let fetch_tool = Tool::builder()
        .block_private_ips(false)
        .same_host_redirects_only(true)
        .build();
    let entry_url = format!("http://localhost:{}/llms.txt", port);
    let outcome = fetch_tool.execute(FetchRequest::new(entry_url)).await;

    assert!(matches!(outcome, Err(FetchError::BlockedUrl)));
}

// ============================================================================
// TM-NET-004: Ambient proxy environment variables
// ============================================================================
Expand Down
Loading