diff --git a/crates/bashkit/src/fs/limits.rs b/crates/bashkit/src/fs/limits.rs index 7f584c20..f0ba0f63 100644 --- a/crates/bashkit/src/fs/limits.rs +++ b/crates/bashkit/src/fs/limits.rs @@ -16,6 +16,10 @@ //! - **TM-DOS-013**: Long filenames → `max_filename_length`, `max_path_length` //! - **TM-DOS-014**: Many directory entries → `max_file_count` //! - **TM-DOS-015**: Unicode path attacks → `validate_path()` control char rejection +//! - **TM-UNI-003**: Zero-width chars in paths → `find_unsafe_path_char()` rejection +//! - **TM-UNI-011**: Tag-block chars in paths → `find_unsafe_path_char()` rejection +//! - **TM-UNI-012**: Interlinear annotations in paths → `find_unsafe_path_char()` rejection +//! - **TM-UNI-013**: Deprecated format chars in paths → `find_unsafe_path_char()` rejection use std::fmt; use std::path::Path; @@ -343,12 +347,23 @@ impl FsLimits { } // THREAT[TM-DOS-015]: Unicode control chars and bidi overrides can cause path confusion -// Mitigation: Reject paths containing these characters +// THREAT[TM-UNI-003]: Zero-width chars create visually-identical filenames +// THREAT[TM-UNI-011]: Tag block chars (U+E0001-U+E007F) hide content invisibly +// THREAT[TM-UNI-012]: Interlinear annotations (U+FFF9-U+FFFB) hide text +// THREAT[TM-UNI-013]: Deprecated format chars (U+206A-U+206F) cause display confusion +// Mitigation: Reject path components containing any of these invisible/confusable chars /// Check if a path component contains unsafe characters. /// /// Returns `Some(description)` for the first unsafe character found. -/// Rejects: ASCII control chars (0x00-0x1F, 0x7F), C1 controls (0x80-0x9F), -/// and Unicode bidi override characters (U+202A-U+202E, U+2066-U+2069). +/// +/// Rejects: +/// - ASCII control chars (0x00-0x1F, 0x7F) +/// - C1 control characters (U+0080-U+009F) +/// - Bidi override characters (U+202A-U+202E, U+2066-U+2069) +/// - Zero-width characters: U+200B-U+200D, U+2060, U+FEFF, U+180E +/// - Deprecated format characters (U+206A-U+206F) +/// - Interlinear annotation markers (U+FFF9-U+FFFB) +/// - Tag block (U+E0000-U+E007F) fn find_unsafe_path_char(name: &str) -> Option { for ch in name.chars() { // ASCII control characters (except we allow nothing - null is already @@ -364,6 +379,28 @@ fn find_unsafe_path_char(name: &str) -> Option { if ('\u{202A}'..='\u{202E}').contains(&ch) || ('\u{2066}'..='\u{2069}').contains(&ch) { return Some(format!("U+{:04X} (bidi override)", ch as u32)); } + // TM-UNI-003: Zero-width characters - invisible, create confusable names + // U+200B ZWSP, U+200C ZWNJ, U+200D ZWJ, U+2060 Word Joiner, + // U+FEFF BOM/Zero Width No-Break Space, U+180E Mongolian Vowel Separator + if matches!( + ch, + '\u{200B}' | '\u{200C}' | '\u{200D}' | '\u{2060}' | '\u{FEFF}' | '\u{180E}' + ) { + return Some(format!("U+{:04X} (zero-width)", ch as u32)); + } + // TM-UNI-013: Deprecated format characters - display confusion + if ('\u{206A}'..='\u{206F}').contains(&ch) { + return Some(format!("U+{:04X} (deprecated format)", ch as u32)); + } + // TM-UNI-012: Interlinear annotation markers - hide text + if ('\u{FFF9}'..='\u{FFFB}').contains(&ch) { + return Some(format!("U+{:04X} (interlinear annotation)", ch as u32)); + } + // TM-UNI-011: Tag block - invisible chars (deprecated in Unicode 5.0) + // Range covers U+E0000 LANGUAGE TAG and U+E0001-U+E007F (TAG ASCII chars) + if ('\u{E0000}'..='\u{E007F}').contains(&ch) { + return Some(format!("U+{:04X} (tag char)", ch as u32)); + } } None } @@ -661,4 +698,111 @@ mod tests { assert!(limits.validate_path(Path::new("/tmp/café")).is_ok()); assert!(limits.validate_path(Path::new("/tmp/文件")).is_ok()); } + + // TM-UNI-003: Zero-width characters in filenames must be rejected + #[test] + fn test_validate_path_zwsp_rejected() { + let limits = FsLimits::new(); + let path = PathBuf::from("/tmp/file\u{200B}name.txt"); + let err = limits.validate_path(&path).unwrap_err(); + let msg = err.to_string(); + assert!(msg.contains("zero-width"), "got: {msg}"); + assert!(msg.contains("U+200B"), "got: {msg}"); + } + + #[test] + fn test_validate_path_zwnj_rejected() { + let limits = FsLimits::new(); + assert!(limits.validate_path(Path::new("/tmp/a\u{200C}b")).is_err()); + } + + #[test] + fn test_validate_path_zwj_rejected() { + let limits = FsLimits::new(); + assert!(limits.validate_path(Path::new("/tmp/a\u{200D}b")).is_err()); + } + + #[test] + fn test_validate_path_word_joiner_rejected() { + let limits = FsLimits::new(); + assert!(limits.validate_path(Path::new("/tmp/a\u{2060}b")).is_err()); + } + + #[test] + fn test_validate_path_bom_rejected() { + let limits = FsLimits::new(); + assert!( + limits + .validate_path(Path::new("/tmp/\u{FEFF}file")) + .is_err() + ); + } + + #[test] + fn test_validate_path_mongolian_vowel_separator_rejected() { + let limits = FsLimits::new(); + assert!(limits.validate_path(Path::new("/tmp/a\u{180E}b")).is_err()); + } + + // TM-UNI-013: Deprecated format characters (U+206A-U+206F) must be rejected + #[test] + fn test_validate_path_deprecated_format_rejected() { + let limits = FsLimits::new(); + for ch in '\u{206A}'..='\u{206F}' { + let path = PathBuf::from(format!("/tmp/a{ch}b")); + let err = limits.validate_path(&path).unwrap_err(); + assert!( + err.to_string().contains("deprecated format"), + "U+{:04X}: {}", + ch as u32, + err + ); + } + } + + // TM-UNI-012: Interlinear annotation markers (U+FFF9-U+FFFB) must be rejected + #[test] + fn test_validate_path_interlinear_annotation_rejected() { + let limits = FsLimits::new(); + for ch in ['\u{FFF9}', '\u{FFFA}', '\u{FFFB}'] { + let path = PathBuf::from(format!("/tmp/a{ch}b")); + let err = limits.validate_path(&path).unwrap_err(); + assert!( + err.to_string().contains("interlinear annotation"), + "U+{:04X}: {}", + ch as u32, + err + ); + } + } + + // TM-UNI-011: Tag block characters (U+E0000-U+E007F) must be rejected + #[test] + fn test_validate_path_tag_chars_rejected() { + let limits = FsLimits::new(); + // Spot-check the boundary values and a TAG ASCII char + for ch in ['\u{E0000}', '\u{E0001}', '\u{E0041}', '\u{E007F}'] { + let path = PathBuf::from(format!("/tmp/a{ch}b")); + let err = limits.validate_path(&path).unwrap_err(); + assert!( + err.to_string().contains("tag char"), + "U+{:04X}: {}", + ch as u32, + err + ); + } + } + + // Adjacent chars to the new ranges must NOT be over-blocked. + #[test] + fn test_validate_path_adjacent_chars_allowed() { + let limits = FsLimits::new(); + // U+200A HAIR SPACE — visible whitespace, just below ZWSP + assert!(limits.validate_path(Path::new("/tmp/a\u{200A}b")).is_ok()); + // U+200E LRM, U+200F RLM — bidi marks (not overrides) + assert!(limits.validate_path(Path::new("/tmp/a\u{200E}b")).is_ok()); + assert!(limits.validate_path(Path::new("/tmp/a\u{200F}b")).is_ok()); + // U+2070 SUPERSCRIPT ZERO — just past the deprecated-format range + assert!(limits.validate_path(Path::new("/tmp/a\u{2070}b")).is_ok()); + } } diff --git a/crates/bashkit/tests/unicode_security_tests.rs b/crates/bashkit/tests/unicode_security_tests.rs index 2bed3f15..83022394 100644 --- a/crates/bashkit/tests/unicode_security_tests.rs +++ b/crates/bashkit/tests/unicode_security_tests.rs @@ -166,59 +166,66 @@ mod byte_boundary_safety { mod zero_width_chars { use super::*; - /// TM-UNI-003: Zero-width space in filename — documents current behavior. - /// Currently UNMITIGATED: find_unsafe_path_char() does not reject ZWSP. + /// TM-UNI-003: Zero-width space in filename — MITIGATED. + /// `find_unsafe_path_char()` rejects ZWSP, so the visually-identical + /// confusable filename can never reach the filesystem. #[tokio::test] - async fn unicode_zwsp_in_filename_current_behavior() { + async fn unicode_zwsp_in_filename_rejected() { let fs = InMemoryFs::new(); - - // Zero Width Space (U+200B) in filename - let result = fs + let err = fs .write_file(Path::new("/tmp/file\u{200B}name.txt"), b"data") - .await; + .await + .expect_err("ZWSP must be rejected"); + let msg = err.to_string(); + assert!( + msg.contains("U+200B") || msg.contains("zero-width") || msg.contains("unsafe"), + "got: {msg}" + ); + } - // Currently this succeeds — documents the gap. - // When TM-UNI-003 is fixed, this should return an error. - if result.is_ok() { - // Gap confirmed: zero-width chars pass validation - // Also verify the file is distinguishable from "filename.txt" - let normal = fs - .write_file(Path::new("/tmp/filename.txt"), b"other") - .await; - assert!(normal.is_ok()); - // Two distinct files exist with visually identical names - let content1 = fs - .read_file(Path::new("/tmp/file\u{200B}name.txt")) - .await - .unwrap(); - let content2 = fs.read_file(Path::new("/tmp/filename.txt")).await.unwrap(); - assert_ne!( - content1, content2, - "ZWSP creates distinct file (TM-UNI-003 gap)" - ); - } - // If it fails, the mitigation has been implemented + /// TM-UNI-003: BOM (U+FEFF) in filename — MITIGATED. + #[tokio::test] + async fn unicode_bom_in_filename_rejected() { + let fs = InMemoryFs::new(); + fs.write_file(Path::new("/tmp/\u{FEFF}file.txt"), b"data") + .await + .expect_err("BOM must be rejected"); } - /// TM-UNI-003: BOM (U+FEFF) in filename — documents current behavior + /// TM-UNI-003: ZWJ (U+200D) in filename — MITIGATED. #[tokio::test] - async fn unicode_bom_in_filename_current_behavior() { + async fn unicode_zwj_in_filename_rejected() { let fs = InMemoryFs::new(); - let result = fs - .write_file(Path::new("/tmp/\u{FEFF}file.txt"), b"data") - .await; - // Documents whether BOM is caught or not - let _ = result; + fs.write_file(Path::new("/tmp/file\u{200D}name.txt"), b"data") + .await + .expect_err("ZWJ must be rejected"); } - /// TM-UNI-003: ZWJ (U+200D) in filename — documents current behavior + /// TM-UNI-011: Tag block char (U+E0041 = TAG LATIN A) — MITIGATED. #[tokio::test] - async fn unicode_zwj_in_filename_current_behavior() { + async fn unicode_tag_char_in_filename_rejected() { let fs = InMemoryFs::new(); - let result = fs - .write_file(Path::new("/tmp/file\u{200D}name.txt"), b"data") - .await; - let _ = result; + fs.write_file(Path::new("/tmp/file\u{E0041}name.txt"), b"data") + .await + .expect_err("tag char must be rejected"); + } + + /// TM-UNI-012: Interlinear annotation marker (U+FFF9) — MITIGATED. + #[tokio::test] + async fn unicode_interlinear_annotation_in_filename_rejected() { + let fs = InMemoryFs::new(); + fs.write_file(Path::new("/tmp/file\u{FFF9}name.txt"), b"data") + .await + .expect_err("interlinear annotation must be rejected"); + } + + /// TM-UNI-013: Deprecated format char (U+206C) — MITIGATED. + #[tokio::test] + async fn unicode_deprecated_format_in_filename_rejected() { + let fs = InMemoryFs::new(); + fs.write_file(Path::new("/tmp/file\u{206C}name.txt"), b"data") + .await + .expect_err("deprecated format char must be rejected"); } /// TM-UNI-004: Zero-width chars in variable names — pass-through is correct @@ -391,41 +398,31 @@ mod combining_char_tests { mod invisible_char_tests { use super::*; - /// TM-UNI-011: Tag characters in filename — documents current behavior + /// TM-UNI-011: U+E0001 LANGUAGE TAG — MITIGATED. #[tokio::test] - async fn unicode_tag_chars_in_filename_current_behavior() { + async fn unicode_tag_chars_in_filename_rejected() { let fs = InMemoryFs::new(); - - // U+E0001 (Language Tag) — invisible, deprecated since Unicode 5.0 - let result = fs - .write_file(Path::new("/tmp/file\u{E0001}name.txt"), b"data") - .await; - // Currently UNMITIGATED — documents the gap - let _ = result; + fs.write_file(Path::new("/tmp/file\u{E0001}name.txt"), b"data") + .await + .expect_err("U+E0001 LANGUAGE TAG must be rejected"); } - /// TM-UNI-012: Interlinear annotation chars in filename — documents current behavior + /// TM-UNI-012: U+FFF9 INTERLINEAR ANNOTATION ANCHOR — MITIGATED. #[tokio::test] - async fn unicode_interlinear_annotation_in_filename() { + async fn unicode_interlinear_annotation_in_filename_rejected() { let fs = InMemoryFs::new(); - - // U+FFF9 (Interlinear Annotation Anchor) - let result = fs - .write_file(Path::new("/tmp/file\u{FFF9}name.txt"), b"data") - .await; - let _ = result; + fs.write_file(Path::new("/tmp/file\u{FFF9}name.txt"), b"data") + .await + .expect_err("U+FFF9 INTERLINEAR ANNOTATION ANCHOR must be rejected"); } - /// TM-UNI-013: Deprecated format chars in filename — documents current behavior + /// TM-UNI-013: U+206A INHIBIT SYMMETRIC SWAPPING — MITIGATED. #[tokio::test] - async fn unicode_deprecated_format_chars_in_filename() { + async fn unicode_deprecated_format_chars_in_filename_rejected() { let fs = InMemoryFs::new(); - - // U+206A (Inhibit Symmetric Swapping) — deprecated - let result = fs - .write_file(Path::new("/tmp/file\u{206A}name.txt"), b"data") - .await; - let _ = result; + fs.write_file(Path::new("/tmp/file\u{206A}name.txt"), b"data") + .await + .expect_err("U+206A deprecated format must be rejected"); } } diff --git a/specs/threat-model.md b/specs/threat-model.md index 47cb6276..f40f12f7 100644 --- a/specs/threat-model.md +++ b/specs/threat-model.md @@ -1260,8 +1260,8 @@ This section maps former vulnerability IDs to the new threat ID scheme and track | TM-DOS-040 | Integer truncation on 32-bit | Size check bypass | Use try_from() | | TM-UNI-001 | Awk parser byte-boundary panic on Unicode | Silent builtin failure on valid input | Fix awk parser to use char-boundary-safe indexing | | TM-UNI-002 | Sed parser byte-boundary issues | Silent builtin failure on valid input | Audit and fix sed byte-indexing | -| TM-UNI-003 | Zero-width chars in filenames | Invisible/confusable filenames | Extend `find_unsafe_path_char()` | -| TM-UNI-011 | Tag characters in filenames | Invisible content in filenames | Extend `find_unsafe_path_char()` | +| ~~TM-UNI-003~~ | ~~Zero-width chars in filenames~~ | ~~Invisible/confusable filenames~~ | ~~Extend `find_unsafe_path_char()`~~ (**FIXED**) | +| ~~TM-UNI-011~~ | ~~Tag characters in filenames~~ | ~~Invisible content in filenames~~ | ~~Extend `find_unsafe_path_char()`~~ (**FIXED**) | | TM-UNI-015 | Expr `substr` byte-boundary panic | Silent failure on multi-byte substr | Fix to use char-boundary-safe indexing (issue #434) | | TM-UNI-016 | Printf precision mid-char panic | Silent failure on multi-byte precision truncation | Use `is_char_boundary()` before slicing (issue #435) | | TM-UNI-017 | Cut/tr byte-level char set parsing | Multi-byte chars silently dropped in tr specs | Switch from `as_bytes()` to char iteration (issue #436) | @@ -1974,7 +1974,7 @@ The `logging_impl.rs:truncate()` function demonstrates the correct pattern using | ID | Threat | Attack Vector | Mitigation | Status | |----|--------|--------------|------------|--------| -| TM-UNI-003 | Zero-width chars in filenames | `touch "/tmp/file\u{200B}name"` — invisible ZWSP creates confusable filenames | `find_unsafe_path_char()` does NOT detect zero-width chars | **UNMITIGATED** | +| TM-UNI-003 | Zero-width chars in filenames | `touch "/tmp/file\u{200B}name"` — invisible ZWSP creates confusable filenames | `find_unsafe_path_char()` rejects U+200B-U+200D, U+2060, U+FEFF, U+180E in path components | **MITIGATED** | | TM-UNI-004 | Zero-width chars in variable names | `\u{200B}PATH=malicious` — invisible char makes variable look like PATH | Not detected; Bash itself allows this | **ACCEPTED** | | TM-UNI-005 | Zero-width chars in script source | `echo "pass\u{200B}word"` — invisible char in string literal | Not detected; pass-through is correct Bash behavior | **ACCEPTED** | @@ -1989,9 +1989,9 @@ The `logging_impl.rs:truncate()` function demonstrates the correct pattern using - U+2060 Word Joiner - U+180E Mongolian Vowel Separator -**Recommendation**: Extend `find_unsafe_path_char()` to reject zero-width characters in -filenames (TM-UNI-003). Variable names and script content should pass through as-is to -match Bash behavior. +**Status**: TM-UNI-003 is **MITIGATED**. `find_unsafe_path_char()` (`crates/bashkit/src/fs/limits.rs`) +rejects U+200B-U+200D, U+2060, U+FEFF, and U+180E in path components. Variable names and +script content still pass through as-is to match Bash behavior (TM-UNI-004, TM-UNI-005). ### 11.3 Homoglyph / Confusable Characters @@ -2034,18 +2034,21 @@ real Bash on Linux. | ID | Threat | Attack Vector | Mitigation | Status | |----|--------|--------------|------------|--------| -| TM-UNI-011 | Tag character hiding | U+E0001-U+E007F (Tags block) — invisible chars that can conceal content in filenames | `find_unsafe_path_char()` does NOT detect tag chars | **UNMITIGATED** | -| TM-UNI-012 | Interlinear annotation hiding | U+FFF9-U+FFFB (Interlinear Annotations) — can hide text in filenames | Not detected in paths | **UNMITIGATED** | -| TM-UNI-013 | Deprecated format chars | U+206A-U+206F (Deprecated formatting) — can cause display confusion | Not detected in paths | **UNMITIGATED** | - -**Current Risk**: LOW — These are extremely obscure. Tag characters were deprecated in -Unicode 5.0. Practical exploitation likelihood is minimal. - -**Recommendation**: Extend `find_unsafe_path_char()` to also reject: -- U+200B-U+200D, U+2060, U+FEFF (zero-width chars, per TM-UNI-003) -- U+E0001-U+E007F (tag characters) -- U+FFF9-U+FFFB (interlinear annotations) -- U+206A-U+206F (deprecated format characters) +| TM-UNI-011 | Tag character hiding | U+E0001-U+E007F (Tags block) — invisible chars that can conceal content in filenames | `find_unsafe_path_char()` rejects U+E0000-U+E007F in path components | **MITIGATED** | +| TM-UNI-012 | Interlinear annotation hiding | U+FFF9-U+FFFB (Interlinear Annotations) — can hide text in filenames | `find_unsafe_path_char()` rejects U+FFF9-U+FFFB in path components | **MITIGATED** | +| TM-UNI-013 | Deprecated format chars | U+206A-U+206F (Deprecated formatting) — can cause display confusion | `find_unsafe_path_char()` rejects U+206A-U+206F in path components | **MITIGATED** | + +**Status**: All three are **MITIGATED**. `find_unsafe_path_char()` rejects every +documented invisible/confusable range in path components: +- U+200B-U+200D, U+2060, U+FEFF, U+180E (zero-width, TM-UNI-003) +- U+202A-U+202E, U+2066-U+2069 (bidi overrides, TM-DOS-015) +- U+206A-U+206F (deprecated format, TM-UNI-013) +- U+FFF9-U+FFFB (interlinear annotations, TM-UNI-012) +- U+E0000-U+E007F (tag block, TM-UNI-011) + +Variable names, script content, and command output are unaffected — pass-through there +matches Bash behavior. Caller-side display sanitization (see Caller Responsibilities) +remains useful defense-in-depth. ### 11.7 Bidi in Script Source @@ -2142,7 +2145,7 @@ specs are rare in practice). |----|--------|------|--------|--------| | TM-UNI-001 | Awk parser byte-boundary panic | MEDIUM | PARTIAL | Fix awk parser indexing (issue #395) | | TM-UNI-002 | Sed parser byte-boundary panic | MEDIUM | PARTIAL | Fix sed byte-indexing patterns | -| TM-UNI-003 | Zero-width chars in filenames | LOW | UNMITIGATED | Extend `find_unsafe_path_char()` | +| TM-UNI-003 | Zero-width chars in filenames | LOW | MITIGATED | `find_unsafe_path_char()` rejects U+200B-U+200D, U+2060, U+FEFF, U+180E | | TM-UNI-004 | Zero-width chars in variables | MINIMAL | ACCEPTED | Matches Bash behavior | | TM-UNI-005 | Zero-width chars in scripts | MINIMAL | ACCEPTED | Correct pass-through | | TM-UNI-006 | Homoglyph filenames | LOW | ACCEPTED | Impractical to fully detect | @@ -2150,9 +2153,9 @@ specs are rare in practice). | TM-UNI-008 | Normalization bypass | LOW | ACCEPTED | Matches Linux FS behavior | | TM-UNI-009 | Excessive combining marks (filenames) | LOW | MITIGATED | Length limits bound damage | | TM-UNI-010 | Excessive combining marks (builtins) | LOW | MITIGATED | Timeout + depth limits | -| TM-UNI-011 | Tag character hiding | LOW | UNMITIGATED | Extend path validation | -| TM-UNI-012 | Interlinear annotation hiding | LOW | UNMITIGATED | Extend path validation | -| TM-UNI-013 | Deprecated format chars | LOW | UNMITIGATED | Extend path validation | +| TM-UNI-011 | Tag character hiding | LOW | MITIGATED | `find_unsafe_path_char()` rejects U+E0000-U+E007F | +| TM-UNI-012 | Interlinear annotation hiding | LOW | MITIGATED | `find_unsafe_path_char()` rejects U+FFF9-U+FFFB | +| TM-UNI-013 | Deprecated format chars | LOW | MITIGATED | `find_unsafe_path_char()` rejects U+206A-U+206F | | TM-UNI-014 | Bidi in script source | LOW | ACCEPTED | Caller responsibility | | TM-UNI-015 | Expr substr byte-boundary panic | MEDIUM | PARTIAL | Fix expr to use char-safe indexing (issue #434) | | TM-UNI-016 | Printf precision mid-char panic | MEDIUM | PARTIAL | Use `is_char_boundary()` (issue #435) |