Skip to content

Commit 8230b11

Browse files
authored
fix(shuf): cap range and repeat output allocation
## What Fix `shuf` so attacker-controlled numeric ranges and repeat counts cannot force unbounded in-process allocation. ## Why `shuf -i LO-HI -n N` materialized the full range before applying `-n`, and `shuf -r -n N` collected all repeated output before interpreter stdout truncation. Huge `u64` inputs could exhaust CPU or memory. ## How - Represent numeric ranges separately from materialized line input. - Sample bounded range output directly without collecting the full range. - Check repeat/range output against `ExecutionLimits` before allocation. - Add regression tests for huge range `-n 1` and repeat output caps. - Document the threat as `TM-DOS-090`. ## Risk - Low - `shuf` now returns an explicit error when requested output exceeds the execution output limit instead of relying on post-command truncation. ## Checklist - [x] Tests added or updated - [x] Backward compatibility considered Validation: - `cargo fmt --check` - `CARGO_INCREMENTAL=0 cargo clippy --all-targets -- -D warnings` - `CARGO_INCREMENTAL=0 cargo test` - `CARGO_INCREMENTAL=0 just vet` - CLI smoke: huge range `-n 1` returns one line - CLI smoke: oversized repeat exits 1 with `output too large`
1 parent ef72d77 commit 8230b11

3 files changed

Lines changed: 264 additions & 44 deletions

File tree

crates/bashkit/src/builtins/shuf.rs

Lines changed: 232 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,13 @@
44
//! `bashkit-coreutils-port` codegen tool — see `generated/shuf_args.rs`
55
//! and `crates/bashkit-coreutils-port/`. Behaviour is implemented locally
66
//! against the bashkit VFS.
7+
//!
8+
//! Resource decision: shuf builds an ExecResult/String in-process, so it must
9+
//! reject outputs that exceed ExecutionLimits before allocation and must never
10+
//! materialize numeric ranges only to apply `-n` afterward.
711
812
use async_trait::async_trait;
13+
use std::collections::HashMap;
914
use std::ffi::OsString;
1015
use std::ops::RangeInclusive;
1116
use std::path::Path;
@@ -14,6 +19,7 @@ use super::generated::shuf_args::shuf_command;
1419
use super::{Builtin, Context, read_text_file};
1520
use crate::error::Result;
1621
use crate::interpreter::ExecResult;
22+
use crate::limits::ExecutionLimits;
1723

1824
pub struct Shuf;
1925

@@ -71,10 +77,10 @@ impl Builtin for Shuf {
7177

7278
let separator = if zero_terminated { '\0' } else { '\n' };
7379

74-
let items: Vec<String> = if echo {
75-
positionals
80+
let input = if echo {
81+
ShufInput::Items(positionals)
7682
} else if let Some(range) = input_range {
77-
range.map(|n| n.to_string()).collect()
83+
ShufInput::Range(range)
7884
} else {
7985
// Reading lines: positional file path, "-", or absent (stdin).
8086
let raw = match positionals.first() {
@@ -92,53 +98,22 @@ impl Builtin for Shuf {
9298
}
9399
}
94100
};
95-
split_separated(&raw, separator)
101+
ShufInput::Items(split_separated(&raw, separator))
96102
};
97103

104+
// THREAT[TM-DOS-090]: Bind shuf's in-process output construction to
105+
// ExecutionLimits before range/repeat loops can allocate.
106+
let output_limit = shuf_output_limit(&ctx, output_path.is_some());
98107
let mut rng = SmallRng::seed_from_now();
99-
100-
let output_lines: Vec<String> = if repeat {
101-
// -r samples *with* replacement: each pick is independent.
102-
// GNU shuf -r without -n loops forever; bashkit caps at 1
103-
// and requires -n, mirroring the safe behavior an embedded
104-
// VFS shell needs.
105-
let count = match head_count {
106-
Some(n) => n,
107-
None => {
108-
return Ok(ExecResult::err(
109-
"shuf: --repeat requires --head-count to be finite\n".to_string(),
110-
1,
111-
));
112-
}
113-
};
114-
if items.is_empty() {
115-
return Ok(ExecResult::err(
116-
"shuf: no lines to repeat from\n".to_string(),
117-
1,
118-
));
119-
}
120-
(0..count)
121-
.map(|_| items[rng.next_usize_lt(items.len())].clone())
122-
.collect()
108+
let out = match if repeat {
109+
render_repeat(input, head_count, separator, output_limit, &mut rng)
123110
} else {
124-
// Without -r: Fisher-Yates shuffle, then truncate to -n.
125-
let mut v = items;
126-
for i in (1..v.len()).rev() {
127-
let j = rng.next_usize_lt(i + 1);
128-
v.swap(i, j);
129-
}
130-
if let Some(n) = head_count {
131-
v.truncate(n as usize);
132-
}
133-
v
111+
render_non_repeat(input, head_count, separator, output_limit, &mut rng)
112+
} {
113+
Ok(out) => out,
114+
Err(stderr) => return Ok(ExecResult::err(stderr, 1)),
134115
};
135116

136-
let mut out = String::with_capacity(output_lines.iter().map(String::len).sum::<usize>());
137-
for line in &output_lines {
138-
out.push_str(line);
139-
out.push(separator);
140-
}
141-
142117
if let Some(path) = output_path {
143118
let resolved = if path.is_absolute() {
144119
path.clone()
@@ -171,6 +146,208 @@ fn split_separated(raw: &str, sep: char) -> Vec<String> {
171146
out
172147
}
173148

149+
enum ShufInput {
150+
Items(Vec<String>),
151+
Range(RangeInclusive<u64>),
152+
}
153+
154+
fn shuf_output_limit(ctx: &Context<'_>, output_to_file: bool) -> usize {
155+
let exec_limit = ctx
156+
.execution_extension::<ExecutionLimits>()
157+
.map(|limits| limits.max_stdout_bytes)
158+
.unwrap_or_else(|| ExecutionLimits::default().max_stdout_bytes);
159+
160+
if !output_to_file {
161+
return exec_limit;
162+
}
163+
164+
let fs_limits = ctx.fs.limits();
165+
exec_limit
166+
.min(u64_to_usize_saturating(fs_limits.max_file_size))
167+
.min(u64_to_usize_saturating(fs_limits.max_total_bytes))
168+
}
169+
170+
fn render_repeat(
171+
input: ShufInput,
172+
head_count: Option<u64>,
173+
separator: char,
174+
output_limit: usize,
175+
rng: &mut SmallRng,
176+
) -> std::result::Result<String, String> {
177+
// -r samples *with* replacement: each pick is independent.
178+
// GNU shuf -r without -n loops forever; bashkit requires -n because
179+
// an embedded VFS shell needs a finite output contract.
180+
let count = match head_count {
181+
Some(n) => n,
182+
None => {
183+
return Err("shuf: --repeat requires --head-count to be finite\n".to_string());
184+
}
185+
};
186+
187+
match input {
188+
ShufInput::Items(items) => {
189+
if items.is_empty() {
190+
return Err("shuf: no lines to repeat from\n".to_string());
191+
}
192+
let max_line_len = items
193+
.iter()
194+
.map(String::len)
195+
.max()
196+
.unwrap_or(0)
197+
.saturating_add(separator.len_utf8());
198+
ensure_output_fits(count as u128, max_line_len as u128, output_limit)?;
199+
200+
let mut out =
201+
String::with_capacity(repeat_capacity(count, max_line_len, output_limit)?);
202+
for _ in 0..count {
203+
out.push_str(&items[rng.next_usize_lt(items.len())]);
204+
out.push(separator);
205+
}
206+
Ok(out)
207+
}
208+
ShufInput::Range(range) => {
209+
let Some((start, end, len)) = range_parts(&range) else {
210+
return Err("shuf: no lines to repeat from\n".to_string());
211+
};
212+
let max_line_len = max_range_value_len(start, end).saturating_add(separator.len_utf8());
213+
ensure_output_fits(count as u128, max_line_len as u128, output_limit)?;
214+
215+
let mut out =
216+
String::with_capacity(repeat_capacity(count, max_line_len, output_limit)?);
217+
for _ in 0..count {
218+
let offset = rng.next_u128_lt(len);
219+
out.push_str(&(start as u128 + offset).to_string());
220+
out.push(separator);
221+
}
222+
Ok(out)
223+
}
224+
}
225+
}
226+
227+
fn render_non_repeat(
228+
input: ShufInput,
229+
head_count: Option<u64>,
230+
separator: char,
231+
output_limit: usize,
232+
rng: &mut SmallRng,
233+
) -> std::result::Result<String, String> {
234+
match input {
235+
ShufInput::Items(mut items) => {
236+
for i in (1..items.len()).rev() {
237+
let j = rng.next_usize_lt(i + 1);
238+
items.swap(i, j);
239+
}
240+
if let Some(n) = head_count {
241+
items.truncate(u64_to_usize_saturating(n));
242+
}
243+
render_items(items, separator, output_limit)
244+
}
245+
ShufInput::Range(range) => {
246+
let Some((start, end, len)) = range_parts(&range) else {
247+
return Ok(String::new());
248+
};
249+
let output_count = head_count.map(u128::from).unwrap_or(len).min(len);
250+
let max_line_len = max_range_value_len(start, end).saturating_add(separator.len_utf8());
251+
ensure_output_fits(output_count, max_line_len as u128, output_limit)?;
252+
253+
let count =
254+
usize::try_from(output_count).map_err(|_| output_too_large(output_limit))?;
255+
let values = sample_range_without_replacement(start, len, count, rng);
256+
render_items(values, separator, output_limit)
257+
}
258+
}
259+
}
260+
261+
fn render_items(
262+
items: Vec<String>,
263+
separator: char,
264+
output_limit: usize,
265+
) -> std::result::Result<String, String> {
266+
let output_len =
267+
items_output_len(&items, separator).ok_or_else(|| output_too_large(output_limit))?;
268+
if output_len > output_limit {
269+
return Err(output_too_large(output_limit));
270+
}
271+
272+
let mut out = String::with_capacity(output_len);
273+
for line in &items {
274+
out.push_str(line);
275+
out.push(separator);
276+
}
277+
Ok(out)
278+
}
279+
280+
fn sample_range_without_replacement(
281+
start: u64,
282+
len: u128,
283+
count: usize,
284+
rng: &mut SmallRng,
285+
) -> Vec<String> {
286+
let mut swaps: HashMap<u128, u128> = HashMap::with_capacity(count.saturating_mul(2));
287+
let mut out = Vec::with_capacity(count);
288+
for i in 0..count as u128 {
289+
let j = i + rng.next_u128_lt(len - i);
290+
let selected = *swaps.get(&j).unwrap_or(&j);
291+
let replacement = *swaps.get(&i).unwrap_or(&i);
292+
swaps.insert(j, replacement);
293+
out.push((start as u128 + selected).to_string());
294+
}
295+
out
296+
}
297+
298+
fn range_parts(range: &RangeInclusive<u64>) -> Option<(u64, u64, u128)> {
299+
let start = *range.start();
300+
let end = *range.end();
301+
if start > end {
302+
return None;
303+
}
304+
Some((start, end, u128::from(end - start) + 1))
305+
}
306+
307+
fn items_output_len(items: &[String], separator: char) -> Option<usize> {
308+
let sep_len = separator.len_utf8();
309+
items.iter().try_fold(0usize, |sum, item| {
310+
sum.checked_add(item.len())?.checked_add(sep_len)
311+
})
312+
}
313+
314+
fn ensure_output_fits(
315+
count: u128,
316+
max_line_len: u128,
317+
output_limit: usize,
318+
) -> std::result::Result<(), String> {
319+
let Some(max_output_len) = count.checked_mul(max_line_len) else {
320+
return Err(output_too_large(output_limit));
321+
};
322+
if max_output_len > output_limit as u128 {
323+
return Err(output_too_large(output_limit));
324+
}
325+
Ok(())
326+
}
327+
328+
fn repeat_capacity(
329+
count: u64,
330+
max_line_len: usize,
331+
output_limit: usize,
332+
) -> std::result::Result<usize, String> {
333+
let capacity = (count as u128)
334+
.checked_mul(max_line_len as u128)
335+
.ok_or_else(|| output_too_large(output_limit))?;
336+
usize::try_from(capacity).map_err(|_| output_too_large(output_limit))
337+
}
338+
339+
fn max_range_value_len(start: u64, end: u64) -> usize {
340+
start.to_string().len().max(end.to_string().len())
341+
}
342+
343+
fn u64_to_usize_saturating(n: u64) -> usize {
344+
usize::try_from(n).unwrap_or(usize::MAX)
345+
}
346+
347+
fn output_too_large(output_limit: usize) -> String {
348+
format!("shuf: output too large (limit {output_limit} bytes)\n")
349+
}
350+
174351
// Note: range parsing is handled by `parse_range` inlined into the
175352
// generated `shuf_args.rs` (codegen copies it from uutils' source).
176353
// We don't need a second parser here; clap returns the parsed
@@ -228,6 +405,17 @@ impl SmallRng {
228405
}
229406
}
230407
}
408+
409+
fn next_u128_lt(&mut self, bound: u128) -> u128 {
410+
debug_assert!(bound > 0);
411+
loop {
412+
let value = (u128::from(self.next_u64()) << 64) | u128::from(self.next_u64());
413+
let zone = u128::MAX - (u128::MAX % bound);
414+
if value < zone {
415+
return value % bound;
416+
}
417+
}
418+
}
231419
}
232420

233421
#[cfg(test)]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
use bashkit::{Bash, ExecutionLimits};
2+
3+
#[tokio::test]
4+
async fn shuf_range_head_count_does_not_materialize_full_range() {
5+
let limits = ExecutionLimits::new().max_stdout_bytes(64);
6+
let mut bash = Bash::builder().limits(limits).build();
7+
8+
let result = bash
9+
.exec("shuf -i 1-18446744073709551615 -n 1")
10+
.await
11+
.unwrap();
12+
13+
assert_eq!(result.exit_code, 0);
14+
assert!(!result.stdout_truncated);
15+
let value = result.stdout.trim().parse::<u64>().unwrap();
16+
assert!(value >= 1);
17+
}
18+
19+
#[tokio::test]
20+
async fn shuf_repeat_head_count_is_checked_before_output_allocation() {
21+
let limits = ExecutionLimits::new().max_stdout_bytes(16);
22+
let mut bash = Bash::builder().limits(limits).build();
23+
24+
let result = bash.exec("shuf -r -e x -n 50").await.unwrap();
25+
26+
assert_eq!(result.exit_code, 1);
27+
assert!(!result.stdout_truncated);
28+
assert!(result.stdout.is_empty());
29+
assert!(result.stderr.contains("output too large"));
30+
}

specs/threat-model.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1339,6 +1339,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track
13391339
| ~~TM-DOS-044~~ | ~~Nested `$()` stack overflow (regression)~~ | ~~Process crash (SIGABRT) at depth ~50 despite #492 fix~~ | ~~Interpreter execution path may need separate depth tracking from lexer fix~~ — depth-50 nested-subst test (`finding_nested_cmd_subst_stack_overflow::depth_50_is_bounded`) passes (**FIXED**) |
13401340
| TM-DOS-088 | Command substitution OOM via state cloning | OOM at depth N (memory ≈ N × state_size) | Dedicated `max_subst_depth` limit (default 32), separate from `max_function_depth`**FIXED** via #1088 |
13411341
| TM-DOS-089 | Command substitution stack overflow via inlined futures | SIGABRT at ~20-30 nested $() levels | Box::pin `expand_word` and `execute_cmd_subst` to cap per-level stack — **FIXED** via #1089 |
1342+
| ~~TM-DOS-090~~ | ~~`shuf` unbounded range/repeat materialization~~ | ~~OOM/CPU exhaustion via huge `--input-range` or `--head-count` before stdout truncation~~ | ~~Sample numeric ranges without full collection and reject output that exceeds `ExecutionLimits` before allocation~~`shuf_resource_tests` cover huge range `-n 1` and repeat output caps (**FIXED**) |
13421343

13431344
### Accepted (Low Priority)
13441345

@@ -1370,6 +1371,7 @@ This section maps former vulnerability IDs to the new threat ID scheme and track
13701371
| Arithmetic depth limit (50) | TM-DOS-026 | `interpreter/mod.rs` | Yes |
13711372
| Builtin parser depth limit (100) | TM-DOS-027 | `builtins/awk.rs`, `builtins/jq/` | Yes |
13721373
| Execution timeout (30s) | TM-DOS-023 | `limits.rs` | Yes |
1374+
| Builtin output pre-allocation caps | TM-DOS-058, TM-DOS-090 | `limits.rs`, `builtins/shuf.rs` | Yes |
13731375
| Virtual filesystem | TM-ESC-001, TM-ESC-003 | `fs/memory.rs` | Yes |
13741376
| Filesystem limits | TM-DOS-005 to TM-DOS-010, TM-DOS-014 | `fs/limits.rs` | Yes |
13751377
| Path depth limit (100) | TM-DOS-012 | `fs/limits.rs` | Yes |

0 commit comments

Comments
 (0)