fix(shuf): cap range and repeat output allocation

chaliy · web-flow · commit 8230b110f664 · 2026-05-08T15:19:47.000-05:00
## What
Fix `shuf` so attacker-controlled numeric ranges and repeat counts
cannot force unbounded in-process allocation.

## Why
`shuf -i LO-HI -n N` materialized the full range before applying `-n`,
and `shuf -r -n N` collected all repeated output before interpreter
stdout truncation. Huge `u64` inputs could exhaust CPU or memory.

## How
- Represent numeric ranges separately from materialized line input.
- Sample bounded range output directly without collecting the full
range.
- Check repeat/range output against `ExecutionLimits` before allocation.
- Add regression tests for huge range `-n 1` and repeat output caps.
- Document the threat as `TM-DOS-090`.

## Risk
- Low
- `shuf` now returns an explicit error when requested output exceeds the
execution output limit instead of relying on post-command truncation.

## Checklist
- [x] Tests added or updated
- [x] Backward compatibility considered

Validation:
- `cargo fmt --check`
- `CARGO_INCREMENTAL=0 cargo clippy --all-targets -- -D warnings`
- `CARGO_INCREMENTAL=0 cargo test`
- `CARGO_INCREMENTAL=0 just vet`
- CLI smoke: huge range `-n 1` returns one line
- CLI smoke: oversized repeat exits 1 with `output too large`
diff --git a/crates/bashkit/src/builtins/shuf.rs b/crates/bashkit/src/builtins/shuf.rs
@@ -4,8 +4,13 @@
 //! `bashkit-coreutils-port` codegen tool — see `generated/shuf_args.rs`
 //! and `crates/bashkit-coreutils-port/`. Behaviour is implemented locally
 //! against the bashkit VFS.
+//!
+//! Resource decision: shuf builds an ExecResult/String in-process, so it must
+//! reject outputs that exceed ExecutionLimits before allocation and must never
+//! materialize numeric ranges only to apply `-n` afterward.
 
 use async_trait::async_trait;
+use std::collections::HashMap;
 use std::ffi::OsString;
 use std::ops::RangeInclusive;
 use std::path::Path;
@@ -14,6 +19,7 @@ use super::generated::shuf_args::shuf_command;
 use super::{Builtin, Context, read_text_file};
 use crate::error::Result;
 use crate::interpreter::ExecResult;
+use crate::limits::ExecutionLimits;
 
 pub struct Shuf;
 
@@ -71,10 +77,10 @@ impl Builtin for Shuf {
 
         let separator = if zero_terminated { '\0' } else { '\n' };
 
-        let items: Vec<String> = if echo {
-            positionals
+        let input = if echo {
+            ShufInput::Items(positionals)
         } else if let Some(range) = input_range {
-            range.map(|n| n.to_string()).collect()
+            ShufInput::Range(range)
         } else {
             // Reading lines: positional file path, "-", or absent (stdin).
             let raw = match positionals.first() {
@@ -92,53 +98,22 @@ impl Builtin for Shuf {
                     }
                 }
             };
-            split_separated(&raw, separator)
+            ShufInput::Items(split_separated(&raw, separator))
         };
 
+        // THREAT[TM-DOS-090]: Bind shuf's in-process output construction to
+        // ExecutionLimits before range/repeat loops can allocate.
+        let output_limit = shuf_output_limit(&ctx, output_path.is_some());
         let mut rng = SmallRng::seed_from_now();
-
-        let output_lines: Vec<String> = if repeat {
-            // -r samples *with* replacement: each pick is independent.
-            // GNU shuf -r without -n loops forever; bashkit caps at 1
-            // and requires -n, mirroring the safe behavior an embedded
-            // VFS shell needs.
-            let count = match head_count {
-                Some(n) => n,
-                None => {
-                    return Ok(ExecResult::err(
-                        "shuf: --repeat requires --head-count to be finite\n".to_string(),
-                        1,
-                    ));
-                }
-            };
-            if items.is_empty() {
-                return Ok(ExecResult::err(
-                    "shuf: no lines to repeat from\n".to_string(),
-                    1,
-                ));
-            }
-            (0..count)
-                .map(|_| items[rng.next_usize_lt(items.len())].clone())
-                .collect()
+        let out = match if repeat {
+            render_repeat(input, head_count, separator, output_limit, &mut rng)
         } else {
-            // Without -r: Fisher-Yates shuffle, then truncate to -n.
-            let mut v = items;
-            for i in (1..v.len()).rev() {
-                let j = rng.next_usize_lt(i + 1);
-                v.swap(i, j);
-            }
-            if let Some(n) = head_count {
-                v.truncate(n as usize);
-            }
-            v
+            render_non_repeat(input, head_count, separator, output_limit, &mut rng)
+        } {
+            Ok(out) => out,
+            Err(stderr) => return Ok(ExecResult::err(stderr, 1)),
         };
 
-        let mut out = String::with_capacity(output_lines.iter().map(String::len).sum::<usize>());
-        for line in &output_lines {
-            out.push_str(line);
-            out.push(separator);
-        }
-
         if let Some(path) = output_path {
             let resolved = if path.is_absolute() {
                 path.clone()
@@ -171,6 +146,208 @@ fn split_separated(raw: &str, sep: char) -> Vec<String> {
     out
 }
 
+enum ShufInput {
+    Items(Vec<String>),
+    Range(RangeInclusive<u64>),
+}
+
+fn shuf_output_limit(ctx: &Context<'_>, output_to_file: bool) -> usize {
+    let exec_limit = ctx
+        .execution_extension::<ExecutionLimits>()
+        .map(|limits| limits.max_stdout_bytes)
+        .unwrap_or_else(|| ExecutionLimits::default().max_stdout_bytes);
+
+    if !output_to_file {
+        return exec_limit;
+    }
+
+    let fs_limits = ctx.fs.limits();
+    exec_limit
+        .min(u64_to_usize_saturating(fs_limits.max_file_size))
+        .min(u64_to_usize_saturating(fs_limits.max_total_bytes))
+}
+
+fn render_repeat(
+    input: ShufInput,
+    head_count: Option<u64>,
+    separator: char,
+    output_limit: usize,
+    rng: &mut SmallRng,
+) -> std::result::Result<String, String> {
+    // -r samples *with* replacement: each pick is independent.
+    // GNU shuf -r without -n loops forever; bashkit requires -n because
+    // an embedded VFS shell needs a finite output contract.
+    let count = match head_count {
+        Some(n) => n,
+        None => {
+            return Err("shuf: --repeat requires --head-count to be finite\n".to_string());
+        }
+    };
+
+    match input {
+        ShufInput::Items(items) => {
+            if items.is_empty() {
+                return Err("shuf: no lines to repeat from\n".to_string());
+            }
+            let max_line_len = items
+                .iter()
+                .map(String::len)
+                .max()
+                .unwrap_or(0)
+                .saturating_add(separator.len_utf8());
+            ensure_output_fits(count as u128, max_line_len as u128, output_limit)?;
+
+            let mut out =
+                String::with_capacity(repeat_capacity(count, max_line_len, output_limit)?);
+            for _ in 0..count {
+                out.push_str(&items[rng.next_usize_lt(items.len())]);
+                out.push(separator);
+            }
+            Ok(out)
+        }
+        ShufInput::Range(range) => {
+            let Some((start, end, len)) = range_parts(&range) else {
+                return Err("shuf: no lines to repeat from\n".to_string());
+            };
+            let max_line_len = max_range_value_len(start, end).saturating_add(separator.len_utf8());
+            ensure_output_fits(count as u128, max_line_len as u128, output_limit)?;
+
+            let mut out =
+                String::with_capacity(repeat_capacity(count, max_line_len, output_limit)?);
+            for _ in 0..count {
+                let offset = rng.next_u128_lt(len);
+                out.push_str(&(start as u128 + offset).to_string());
+                out.push(separator);
+            }
+            Ok(out)
+        }
+    }
+}
+
+fn render_non_repeat(
+    input: ShufInput,
+    head_count: Option<u64>,
+    separator: char,
+    output_limit: usize,
+    rng: &mut SmallRng,
+) -> std::result::Result<String, String> {
+    match input {
+        ShufInput::Items(mut items) => {
+            for i in (1..items.len()).rev() {
+                let j = rng.next_usize_lt(i + 1);
+                items.swap(i, j);
+            }
+            if let Some(n) = head_count {
+                items.truncate(u64_to_usize_saturating(n));
+            }
+            render_items(items, separator, output_limit)
+        }
+        ShufInput::Range(range) => {
+            let Some((start, end, len)) = range_parts(&range) else {
+                return Ok(String::new());
+            };
+            let output_count = head_count.map(u128::from).unwrap_or(len).min(len);
+            let max_line_len = max_range_value_len(start, end).saturating_add(separator.len_utf8());
+            ensure_output_fits(output_count, max_line_len as u128, output_limit)?;
+
+            let count =
+                usize::try_from(output_count).map_err(|_| output_too_large(output_limit))?;
+            let values = sample_range_without_replacement(start, len, count, rng);
+            render_items(values, separator, output_limit)
+        }
+    }
+}
+
+fn render_items(
+    items: Vec<String>,
+    separator: char,
+    output_limit: usize,
+) -> std::result::Result<String, String> {
+    let output_len =
+        items_output_len(&items, separator).ok_or_else(|| output_too_large(output_limit))?;
+    if output_len > output_limit {
+        return Err(output_too_large(output_limit));
+    }
+
+    let mut out = String::with_capacity(output_len);
+    for line in &items {
+        out.push_str(line);
+        out.push(separator);
+    }
+    Ok(out)
+}
+
+fn sample_range_without_replacement(
+    start: u64,
+    len: u128,
+    count: usize,
+    rng: &mut SmallRng,
+) -> Vec<String> {
+    let mut swaps: HashMap<u128, u128> = HashMap::with_capacity(count.saturating_mul(2));
+    let mut out = Vec::with_capacity(count);
+    for i in 0..count as u128 {
+        let j = i + rng.next_u128_lt(len - i);
+        let selected = *swaps.get(&j).unwrap_or(&j);
+        let replacement = *swaps.get(&i).unwrap_or(&i);
+        swaps.insert(j, replacement);
+        out.push((start as u128 + selected).to_string());
+    }
+    out
+}
+
+fn range_parts(range: &RangeInclusive<u64>) -> Option<(u64, u64, u128)> {
+    let start = *range.start();
+    let end = *range.end();
+    if start > end {
+        return None;
+    }
+    Some((start, end, u128::from(end - start) + 1))
+}
+
+fn items_output_len(items: &[String], separator: char) -> Option<usize> {
+    let sep_len = separator.len_utf8();
+    items.iter().try_fold(0usize, |sum, item| {
+        sum.checked_add(item.len())?.checked_add(sep_len)
+    })
+}
+
+fn ensure_output_fits(
+    count: u128,
+    max_line_len: u128,
+    output_limit: usize,
+) -> std::result::Result<(), String> {
+    let Some(max_output_len) = count.checked_mul(max_line_len) else {
+        return Err(output_too_large(output_limit));
+    };
+    if max_output_len > output_limit as u128 {
+        return Err(output_too_large(output_limit));
+    }
+    Ok(())
+}
+
+fn repeat_capacity(
+    count: u64,
+    max_line_len: usize,
+    output_limit: usize,
+) -> std::result::Result<usize, String> {
+    let capacity = (count as u128)
+        .checked_mul(max_line_len as u128)
+        .ok_or_else(|| output_too_large(output_limit))?;
+    usize::try_from(capacity).map_err(|_| output_too_large(output_limit))
+}
+
+fn max_range_value_len(start: u64, end: u64) -> usize {
+    start.to_string().len().max(end.to_string().len())
+}
+
+fn u64_to_usize_saturating(n: u64) -> usize {
+    usize::try_from(n).unwrap_or(usize::MAX)
+}
+
+fn output_too_large(output_limit: usize) -> String {
+    format!("shuf: output too large (limit {output_limit} bytes)\n")
+}
+
 // Note: range parsing is handled by `parse_range` inlined into the
 // generated `shuf_args.rs` (codegen copies it from uutils' source).
 // We don't need a second parser here; clap returns the parsed
@@ -228,6 +405,17 @@ impl SmallRng {
             }
         }
     }
+
+    fn next_u128_lt(&mut self, bound: u128) -> u128 {
+        debug_assert!(bound > 0);
+        loop {
+            let value = (u128::from(self.next_u64()) << 64) | u128::from(self.next_u64());
+            let zone = u128::MAX - (u128::MAX % bound);
+            if value < zone {
+                return value % bound;
+            }
+        }
+    }
 }
 
 #[cfg(test)]
diff --git a/crates/bashkit/tests/shuf_resource_tests.rs b/crates/bashkit/tests/shuf_resource_tests.rs
@@ -0,0 +1,30 @@
+use bashkit::{Bash, ExecutionLimits};
+
+#[tokio::test]
+async fn shuf_range_head_count_does_not_materialize_full_range() {
+    let limits = ExecutionLimits::new().max_stdout_bytes(64);
+    let mut bash = Bash::builder().limits(limits).build();
+
+    let result = bash
+        .exec("shuf -i 1-18446744073709551615 -n 1")
+        .await
+        .unwrap();
+
+    assert_eq!(result.exit_code, 0);
+    assert!(!result.stdout_truncated);
+    let value = result.stdout.trim().parse::<u64>().unwrap();
+    assert!(value >= 1);
+}
+
+#[tokio::test]
+async fn shuf_repeat_head_count_is_checked_before_output_allocation() {
+    let limits = ExecutionLimits::new().max_stdout_bytes(16);
+    let mut bash = Bash::builder().limits(limits).build();
+
+    let result = bash.exec("shuf -r -e x -n 50").await.unwrap();
+
+    assert_eq!(result.exit_code, 1);
+    assert!(!result.stdout_truncated);
+    assert!(result.stdout.is_empty());
+    assert!(result.stderr.contains("output too large"));
+}
diff --git a/specs/threat-model.md b/specs/threat-model.md
@@ -1339,6 +1339,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track
 | ~~TM-DOS-044~~ | ~~Nested `$()` stack overflow (regression)~~ | ~~Process crash (SIGABRT) at depth ~50 despite #492 fix~~ | ~~Interpreter execution path may need separate depth tracking from lexer fix~~ — depth-50 nested-subst test (`finding_nested_cmd_subst_stack_overflow::depth_50_is_bounded`) passes (**FIXED**) |
 | TM-DOS-088 | Command substitution OOM via state cloning | OOM at depth N (memory ≈ N × state_size) | Dedicated `max_subst_depth` limit (default 32), separate from `max_function_depth` — **FIXED** via #1088 |
 | TM-DOS-089 | Command substitution stack overflow via inlined futures | SIGABRT at ~20-30 nested $() levels | Box::pin `expand_word` and `execute_cmd_subst` to cap per-level stack — **FIXED** via #1089 |
+| ~~TM-DOS-090~~ | ~~`shuf` unbounded range/repeat materialization~~ | ~~OOM/CPU exhaustion via huge `--input-range` or `--head-count` before stdout truncation~~ | ~~Sample numeric ranges without full collection and reject output that exceeds `ExecutionLimits` before allocation~~ — `shuf_resource_tests` cover huge range `-n 1` and repeat output caps (**FIXED**) |
 
 ### Accepted (Low Priority)
 
@@ -1370,6 +1371,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track
 | Arithmetic depth limit (50) | TM-DOS-026 | `interpreter/mod.rs` | Yes |
 | Builtin parser depth limit (100) | TM-DOS-027 | `builtins/awk.rs`, `builtins/jq/` | Yes |
 | Execution timeout (30s) | TM-DOS-023 | `limits.rs` | Yes |
+| Builtin output pre-allocation caps | TM-DOS-058, TM-DOS-090 | `limits.rs`, `builtins/shuf.rs` | Yes |
 | Virtual filesystem | TM-ESC-001, TM-ESC-003 | `fs/memory.rs` | Yes |
 | Filesystem limits | TM-DOS-005 to TM-DOS-010, TM-DOS-014 | `fs/limits.rs` | Yes |
 | Path depth limit (100) | TM-DOS-012 | `fs/limits.rs` | Yes |