From 904105a1d21fee7364ad8f826199cbdf3fabce08 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 12 Mar 2026 16:44:23 +0000 Subject: [PATCH 1/2] chore[fsst]: add fsst contains benchmarks Signed-off-by: Joe Isaacs --- Cargo.lock | 17 + Cargo.toml | 5 + encodings/fsst/Cargo.toml | 9 + encodings/fsst/benches/fsst_contains.rs | 3650 +++++++++++++++++++++++ 4 files changed, 3681 insertions(+) create mode 100644 encodings/fsst/benches/fsst_contains.rs diff --git a/Cargo.lock b/Cargo.lock index 38a21cae5d4..c728c958ce9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1769,6 +1769,12 @@ dependencies = [ "syn 2.0.117", ] +[[package]] +name = "daachorse" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36" + [[package]] name = "darling" version = "0.23.0" @@ -4731,6 +4737,12 @@ dependencies = [ "glob", ] +[[package]] +name = "jetscii" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" + [[package]] name = "jiff" version = "0.2.23" @@ -10145,10 +10157,15 @@ dependencies = [ name = "vortex-fsst" version = "0.1.0" dependencies = [ + "aho-corasick", "codspeed-divan-compat", + "daachorse", "fsst-rs", + "jetscii", + "memchr", "prost 0.14.3", "rand 0.9.2", + "regex-automata", "rstest", "vortex-array", "vortex-buffer", diff --git a/Cargo.toml b/Cargo.toml index a025edb2769..b2eea33447b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ rust-version = "1.90" version = "0.1.0" [workspace.dependencies] +aho-corasick = "1.1.3" anyhow = "1.0.97" arbitrary = "1.3.2" arc-swap = "1.8" @@ -121,6 +122,7 @@ cudarc = { version = "0.18.2", features = [ "cuda-12050", ] } custom-labels = "0.4.4" +daachorse = "1.0.0" dashmap = "6.1.0" datafusion = { version = "52", default-features = false, features = ["sql"] } datafusion-catalog = { version = "52" } @@ -155,6 +157,7 @@ 
indicatif = "0.18.0" insta = "1.43" inventory = "0.3.20" itertools = "0.14.0" +jetscii = "0.5.3" jiff = "0.2.0" kanal = "0.1.1" lending-iterator = "0.1.7" @@ -163,6 +166,7 @@ libloading = "0.8" liblzma = "0.4" log = { version = "0.4.21" } loom = { version = "0.7", features = ["checkpoint"] } +memchr = "2.8.0" memmap2 = "0.9.5" mimalloc = "0.1.42" moka = { version = "0.12.10", default-features = false } @@ -196,6 +200,7 @@ rand = "0.9.0" rand_distr = "0.5" ratatui = { version = "0.30", default-features = false } regex = "1.11.0" +regex-automata = "0.4" reqwest = { version = "0.12.4", features = [ "charset", "http2", diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 0dd5ce55a22..0bcc16b22c9 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -30,8 +30,13 @@ vortex-session = { workspace = true } _test-harness = ["dep:rand", "vortex-array/_test-harness"] [dev-dependencies] +aho-corasick = { workspace = true } +daachorse = { workspace = true } divan = { workspace = true } +jetscii = { workspace = true } +memchr = { workspace = true } rand = { workspace = true } +regex-automata = { workspace = true } rstest = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } @@ -39,6 +44,10 @@ vortex-array = { workspace = true, features = ["_test-harness"] } name = "fsst_compress" harness = false +[[bench]] +name = "fsst_contains" +harness = false + [[bench]] name = "fsst_url_compare" harness = false diff --git a/encodings/fsst/benches/fsst_contains.rs b/encodings/fsst/benches/fsst_contains.rs new file mode 100644 index 00000000000..187be73cd5b --- /dev/null +++ b/encodings/fsst/benches/fsst_contains.rs @@ -0,0 +1,3650 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow( + clippy::unwrap_used, + clippy::cast_possible_truncation, + clippy::missing_safety_doc +)] + +use aho_corasick::AhoCorasick; +use daachorse::DoubleArrayAhoCorasick; +use 
divan::Bencher; +use fsst::ESCAPE_CODE; +use fsst::Symbol; +use memchr::memmem; +use rand::Rng; +use rand::SeedableRng; +use rand::rngs::StdRng; +use regex_automata::dfa::regex::Regex as DfaRegex; +use vortex_array::ToCanonical; +use vortex_array::accessor::ArrayAccessor; +use vortex_array::arrays::VarBinArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_array::match_each_integer_ptype; +use vortex_buffer::BitBufferMut; +use vortex_fsst::FSSTArray; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; + +fn main() { + divan::main(); +} + +// --------------------------------------------------------------------------- +// URL generator +// --------------------------------------------------------------------------- + +const DOMAINS: &[&str] = &[ + "google.com", + "facebook.com", + "github.com", + "stackoverflow.com", + "amazon.com", + "reddit.com", + "twitter.com", + "youtube.com", + "wikipedia.org", + "microsoft.com", + "apple.com", + "netflix.com", + "linkedin.com", + "cloudflare.com", + "google.co.uk", + "docs.google.com", + "mail.google.com", + "maps.google.com", + "news.ycombinator.com", + "arxiv.org", +]; + +const PATHS: &[&str] = &[ + "/index.html", + "/about", + "/search?q=vortex", + "/user/profile/settings", + "/api/v2/data", + "/blog/2024/post", + "/products/item/12345", + "/docs/reference/guide", + "/login", + "/dashboard/analytics", +]; + +fn generate_urls(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(42); + (0..n) + .map(|_| { + let scheme = if rng.random_bool(0.8) { + "https" + } else { + "http" + }; + let domain = DOMAINS[rng.random_range(0..DOMAINS.len())]; + let path = PATHS[rng.random_range(0..PATHS.len())]; + format!("{scheme}://{domain}{path}") + }) + .collect() +} + +fn make_fsst_urls(n: usize) -> FSSTArray { + let urls = generate_urls(n); + let varbin = VarBinArray::from_iter( + urls.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let 
compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// KMP helpers +// --------------------------------------------------------------------------- + +fn kmp_failure_table(needle: &[u8]) -> Vec { + let mut failure = vec![0usize; needle.len()]; + let mut k = 0; + for i in 1..needle.len() { + while k > 0 && needle[k] != needle[i] { + k = failure[k - 1]; + } + if needle[k] == needle[i] { + k += 1; + } + failure[i] = k; + } + failure +} + +fn kmp_byte_transitions(needle: &[u8]) -> Vec { + let n_states = needle.len() + 1; + let accept = needle.len() as u16; + let failure = kmp_failure_table(needle); + + let mut table = vec![0u16; n_states * 256]; + for state in 0..n_states { + for byte in 0..256u16 { + if state == needle.len() { + table[state * 256 + byte as usize] = accept; + continue; + } + let mut s = state; + loop { + if byte as u8 == needle[s] { + s += 1; + break; + } + if s == 0 { + break; + } + s = failure[s - 1]; + } + table[state * 256 + byte as usize] = s as u16; + } + } + table +} + +// --------------------------------------------------------------------------- +// Approach 1: Original split-table DFA (baseline from production code) +// --------------------------------------------------------------------------- + +struct SplitTableDfa { + symbol_transitions: Vec, + escape_transitions: Vec, + n_symbols: usize, + accept_state: u16, +} + +impl SplitTableDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let n_symbols = symbols.len(); + let accept_state = needle.len() as u16; + let n_states = needle.len() + 1; + + let byte_table = kmp_byte_transitions(needle); + + let mut symbol_transitions = vec![0u16; n_states * n_symbols]; + for state in 0..n_states { + for code in 0..n_symbols { + if state as u16 == accept_state { + symbol_transitions[state * n_symbols + code] = accept_state; + continue; + } + let sym = 
symbols[code].to_u64().to_le_bytes(); + let sym_len = symbol_lengths[code] as usize; + let mut s = state as u16; + for &b in &sym[..sym_len] { + if s == accept_state { + break; + } + s = byte_table[s as usize * 256 + b as usize]; + } + symbol_transitions[state * n_symbols + code] = s; + } + } + + Self { + symbol_transitions, + escape_transitions: byte_table, + n_symbols, + accept_state, + } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + let mut state = 0u16; + let mut pos = 0; + while pos < codes.len() { + if state == self.accept_state { + return true; + } + let code = codes[pos]; + pos += 1; + if code == ESCAPE_CODE { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = self.symbol_transitions[state as usize * self.n_symbols + code as usize]; + } + } + state == self.accept_state + } +} + +// --------------------------------------------------------------------------- +// Approach 2: Fused 256-entry table (unified lookup, sentinel for escapes) +// --------------------------------------------------------------------------- + +struct FusedTableDfa { + transitions: Vec, + escape_transitions: Vec, + accept_state: u16, + escape_sentinel: u16, +} + +impl FusedTableDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let n_symbols = symbols.len(); + let accept_state = needle.len() as u16; + let n_states = needle.len() + 1; + let escape_sentinel = n_states as u16 + 1; + + let byte_table = kmp_byte_transitions(needle); + + let mut symbol_transitions = vec![0u16; n_states * n_symbols]; + for state in 0..n_states { + for code in 0..n_symbols { + if state as u16 == accept_state { + symbol_transitions[state * n_symbols + code] = accept_state; + continue; + } + let sym = symbols[code].to_u64().to_le_bytes(); + let sym_len = symbol_lengths[code] as usize; + let mut s = state as u16; + for &b in &sym[..sym_len] { + if 
s == accept_state { + break; + } + s = byte_table[s as usize * 256 + b as usize]; + } + symbol_transitions[state * n_symbols + code] = s; + } + } + + let mut transitions = vec![0u16; n_states * 256]; + for state in 0..n_states { + for code in 0..n_symbols { + transitions[state * 256 + code] = symbol_transitions[state * n_symbols + code]; + } + transitions[state * 256 + ESCAPE_CODE as usize] = escape_sentinel; + } + + Self { + transitions, + escape_transitions: byte_table, + accept_state, + escape_sentinel, + } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + let mut state = 0u16; + let mut pos = 0; + while pos < codes.len() { + if state == self.accept_state { + return true; + } + let code = codes[pos]; + pos += 1; + let next = self.transitions[state as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = next; + } + } + state == self.accept_state + } + + /// No early exit — skip the accept_state check inside the loop. + /// Only check at the end. The accept state is sticky (transitions to itself), + /// so final state == accept means we matched at some point. + #[inline] + fn matches_no_early_exit(&self, codes: &[u8]) -> bool { + let mut state = 0u16; + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + let next = self.transitions[state as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = next; + } + } + state == self.accept_state + } + + /// Unsafe variant — eliminates bounds checks on table lookups. 
+ #[inline] + unsafe fn matches_unchecked(&self, codes: &[u8]) -> bool { + unsafe { + let mut state = 0u16; + let mut pos = 0; + let transitions = self.transitions.as_ptr(); + let escape_transitions = self.escape_transitions.as_ptr(); + let len = codes.len(); + let codes_ptr = codes.as_ptr(); + + while pos < len { + if state == self.accept_state { + return true; + } + let code = *codes_ptr.add(pos); + pos += 1; + let next = *transitions.add(state as usize * 256 + code as usize); + if next == self.escape_sentinel { + if pos >= len { + return false; + } + let b = *codes_ptr.add(pos); + pos += 1; + state = *escape_transitions.add(state as usize * 256 + b as usize); + } else { + state = next; + } + } + state == self.accept_state + } + } + + /// No early exit + unsafe bounds elimination. + #[inline] + unsafe fn matches_no_exit_unchecked(&self, codes: &[u8]) -> bool { + unsafe { + let mut state = 0u16; + let mut pos = 0; + let transitions = self.transitions.as_ptr(); + let escape_transitions = self.escape_transitions.as_ptr(); + let len = codes.len(); + let codes_ptr = codes.as_ptr(); + + while pos < len { + let code = *codes_ptr.add(pos); + pos += 1; + let next = *transitions.add(state as usize * 256 + code as usize); + if next == self.escape_sentinel { + if pos >= len { + return false; + } + let b = *codes_ptr.add(pos); + pos += 1; + state = *escape_transitions.add(state as usize * 256 + b as usize); + } else { + state = next; + } + } + state == self.accept_state + } + } +} + +// --------------------------------------------------------------------------- +// Approach 3: Fused u32 table for SIMD gather (process 8 strings at once) +// --------------------------------------------------------------------------- + +#[cfg(target_arch = "x86_64")] +struct SimdGatherDfa { + /// u32 transition table, 256 entries per state. + transitions: Vec, + /// u32 escape transition table, 256 entries per state. 
+ escape_transitions: Vec, + accept_state: u32, + escape_sentinel: u32, +} + +#[cfg(target_arch = "x86_64")] +impl SimdGatherDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); + + Self { + transitions: fused.transitions.iter().map(|&v| v as u32).collect(), + escape_transitions: fused.escape_transitions.iter().map(|&v| v as u32).collect(), + accept_state: fused.accept_state as u32, + escape_sentinel: fused.escape_sentinel as u32, + } + } + + /// Scalar fallback using the u32 tables. + #[inline] + fn matches_scalar(&self, codes: &[u8]) -> bool { + let mut state = 0u32; + let mut pos = 0; + while pos < codes.len() { + if state == self.accept_state { + return true; + } + let code = codes[pos]; + pos += 1; + let next = self.transitions[state as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = next; + } + } + state == self.accept_state + } + + /// Process 8 strings simultaneously using AVX2 gather for transition lookups. + /// + /// Each iteration loads one code byte from each of 8 strings, computes + /// table indices, and uses VPGATHERDD to fetch 8 transitions at once. 
+ #[cfg(target_feature = "avx2")] + #[inline] + unsafe fn matches_8_avx2( + &self, + all_bytes: &[u8], + starts: &[usize; 8], + ends: &[usize; 8], + ) -> [bool; 8] { + unsafe { + let transitions_ptr = self.transitions.as_ptr() as *const i32; + let escape_ptr = self.escape_transitions.as_ptr() as *const i32; + let bytes_ptr = all_bytes.as_ptr(); + let accept = self.accept_state; + let sentinel = self.escape_sentinel; + + let mut states = [0u32; 8]; + let mut pos: [usize; 8] = *starts; + let mut done = [false; 8]; + + loop { + let mut any_active = false; + + for k in 0..8 { + if done[k] { + continue; + } + if pos[k] >= ends[k] { + done[k] = true; + continue; + } + any_active = true; + + let code = *bytes_ptr.add(pos[k]); + pos[k] += 1; + let next = + *transitions_ptr.add(states[k] as usize * 256 + code as usize) as u32; + if next == sentinel { + if pos[k] >= ends[k] { + done[k] = true; + continue; + } + let b = *bytes_ptr.add(pos[k]); + pos[k] += 1; + states[k] = *escape_ptr.add(states[k] as usize * 256 + b as usize) as u32; + } else { + states[k] = next; + } + if states[k] == accept { + done[k] = true; + } + } + if !any_active { + break; + } + } + + std::array::from_fn(|k| states[k] == accept) + } + } +} + +// --------------------------------------------------------------------------- +// Approach 4: Branchless escape handling via combined table +// Instead of branching on escape sentinel, use a "code_advance" table that +// tells how many bytes to consume (1 for normal, 2 for escape), and a +// combined table that gives the right state for both cases. +// --------------------------------------------------------------------------- + +struct BranchlessEscapeDfa { + /// For each (state, first_byte, second_byte) triple, the next state. + /// But 256*256 per state is too large. Instead: + /// For non-escape codes: transitions[state * 256 + code] gives next state. 
+ /// For escape code: transitions[state * 256 + 255] is unused; we use + /// escape_transitions[state * 256 + literal_byte]. + /// + /// The branchless trick: always read the next byte (speculatively). + /// Use a conditional move to select between the normal and escape path. + transitions: Vec, + escape_transitions: Vec, + /// 1 for normal codes, 2 for ESCAPE_CODE. + code_advance: [u8; 256], + accept_state: u16, +} + +impl BranchlessEscapeDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); + + let mut code_advance = [1u8; 256]; + code_advance[ESCAPE_CODE as usize] = 2; + + Self { + transitions: fused.transitions, + escape_transitions: fused.escape_transitions, + code_advance, + accept_state: fused.accept_state, + } + } + + /// Branchless escape handling: speculatively read the next byte and + /// select between normal and escape transitions using conditional ops. + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + if codes.is_empty() { + return self.accept_state == 0; + } + let mut state = 0u16; + let mut pos = 0; + let len = codes.len(); + + while pos < len { + let code = codes[pos]; + let advance = self.code_advance[code as usize] as usize; + + // Speculatively read the next byte (needed for escapes). + // For non-escape codes this read is wasted but harmless. + let next_byte = if pos + 1 < len { codes[pos + 1] } else { 0 }; + + let normal_next = self.transitions[state as usize * 256 + code as usize]; + let escape_next = self.escape_transitions[state as usize * 256 + next_byte as usize]; + + // Select: if this is an escape code, use escape_next; otherwise normal_next. 
+ let is_escape = code == ESCAPE_CODE; + state = if is_escape { escape_next } else { normal_next }; + + pos += advance; + + if state == self.accept_state { + return true; + } + } + state == self.accept_state + } +} + +// --------------------------------------------------------------------------- +// Approach 5: u8 state table — halve table size (u16→u8) since states fit in +// a byte. Smaller tables = better cache utilization. +// --------------------------------------------------------------------------- + +struct CompactDfa { + /// u8 transitions, 256 entries per state. + transitions: Vec, + escape_transitions: Vec, + accept_state: u8, + escape_sentinel: u8, +} + +impl CompactDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); + Self { + transitions: fused.transitions.iter().map(|&v| v as u8).collect(), + escape_transitions: fused.escape_transitions.iter().map(|&v| v as u8).collect(), + accept_state: fused.accept_state as u8, + escape_sentinel: fused.escape_sentinel as u8, + } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + let mut state = 0u8; + let mut pos = 0; + while pos < codes.len() { + if state == self.accept_state { + return true; + } + let code = codes[pos]; + pos += 1; + let next = self.transitions[state as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = next; + } + } + state == self.accept_state + } + + #[inline] + fn matches_no_early_exit(&self, codes: &[u8]) -> bool { + let mut state = 0u8; + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + let next = self.transitions[state as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 
1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = next; + } + } + state == self.accept_state + } + + /// Unsafe no-exit variant. + #[inline] + unsafe fn matches_no_exit_unchecked(&self, codes: &[u8]) -> bool { + unsafe { + let mut state = 0u8; + let mut pos = 0; + let transitions = self.transitions.as_ptr(); + let escape_transitions = self.escape_transitions.as_ptr(); + let len = codes.len(); + let codes_ptr = codes.as_ptr(); + + while pos < len { + let code = *codes_ptr.add(pos); + pos += 1; + let next = *transitions.add(state as usize * 256 + code as usize); + if next == self.escape_sentinel { + if pos >= len { + return false; + } + let b = *codes_ptr.add(pos); + pos += 1; + state = *escape_transitions.add(state as usize * 256 + b as usize); + } else { + state = next; + } + } + state == self.accept_state + } + } +} + +// --------------------------------------------------------------------------- +// Approach 6: Streaming scan — process the ENTIRE codes buffer in one pass, +// resetting state at string boundaries. Avoids per-string slice overhead +// and is friendlier to the hardware prefetcher. +// --------------------------------------------------------------------------- + +#[inline(never)] +#[allow(dead_code)] +fn streaming_scan_fused( + dfa: &FusedTableDfa, + all_bytes: &[u8], + offsets: &[usize], + n: usize, +) -> BitBufferMut { + BitBufferMut::collect_bool(n, |i| { + // The collect_bool closure is called sequentially for i=0..n. + // We rely on the sequential access pattern being prefetch-friendly. + let start = offsets[i]; + let end = offsets[i + 1]; + dfa.matches(&all_bytes[start..end]) + }) +} + +/// True streaming: single pass through all_bytes with offset-based reset. 
+#[inline(never)] +fn streaming_scan_continuous( + dfa: &CompactDfa, + all_bytes: &[u8], + offsets: &[usize], + n: usize, + out: &mut BitBufferMut, +) { + let mut string_idx = 0; + let mut state = 0u8; + let mut next_boundary = offsets[1]; + let mut matched = false; + + let mut pos = offsets[0]; + let total_end = offsets[n]; + + while pos < total_end { + // Check if we've crossed into a new string. + while pos >= next_boundary { + // Record result for the just-finished string. + if matched || state == dfa.accept_state { + out.set(string_idx); + } + string_idx += 1; + if string_idx >= n { + return; + } + state = 0; + matched = false; + next_boundary = offsets[string_idx + 1]; + } + + let code = all_bytes[pos]; + pos += 1; + let next = dfa.transitions[state as usize * 256 + code as usize]; + if next == dfa.escape_sentinel { + if pos < next_boundary { + let b = all_bytes[pos]; + pos += 1; + state = dfa.escape_transitions[state as usize * 256 + b as usize]; + } + } else { + state = next; + } + if state == dfa.accept_state { + matched = true; + } + } + + // Handle the last string. + if string_idx < n && (matched || state == dfa.accept_state) { + out.set(string_idx); + } +} + +// --------------------------------------------------------------------------- +// Approach 7: Prefilter — build a bitmask of codes that could possibly +// contribute to matching the needle. Skip DFA for strings where no code +// belongs to that set. +// --------------------------------------------------------------------------- + +struct PrefilterDfa { + inner: CompactDfa, + /// For each code byte (0..255), true if that code could produce any byte + /// present in the needle (i.e., the symbol's bytes intersect needle's bytes). + relevant_codes: [bool; 256], +} + +impl PrefilterDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let inner = CompactDfa::new(symbols, symbol_lengths, needle); + + // Build set of bytes that appear in the needle. 
+ let mut needle_bytes = [false; 256]; + for &b in needle { + needle_bytes[b as usize] = true; + } + + // For each symbol code, check if any of its bytes appear in the needle. + let mut relevant_codes = [false; 256]; + for (code, (sym, &sym_len)) in symbols.iter().zip(symbol_lengths.iter()).enumerate() { + let sym_bytes = sym.to_u64().to_le_bytes(); + for &b in &sym_bytes[..sym_len as usize] { + if needle_bytes[b as usize] { + relevant_codes[code] = true; + break; + } + } + } + // Escape code is always relevant (literal bytes could be anything). + relevant_codes[ESCAPE_CODE as usize] = true; + + Self { + inner, + relevant_codes, + } + } + + /// Quick check: does this code sequence contain any code that could + /// contribute to the needle match? + #[inline] + fn could_match(&self, codes: &[u8]) -> bool { + codes.iter().any(|&c| self.relevant_codes[c as usize]) + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + if !self.could_match(codes) { + return false; + } + self.inner.matches(codes) + } + + #[inline] + fn matches_no_early_exit(&self, codes: &[u8]) -> bool { + if !self.could_match(codes) { + return false; + } + self.inner.matches_no_early_exit(codes) + } +} + +// --------------------------------------------------------------------------- +// Approach 8: State-zero skip DFA — skip runs of codes that keep state=0. +// +// Precompute a 256-byte lookup: for each code byte, does transitioning from +// state 0 stay in state 0? If so, that code is "trivial" and can be skipped. +// Process codes in chunks: scan for the first non-trivial code, then run +// the scalar DFA from there. This is most effective when the needle is rare +// (most codes are trivial), which is the common case for selective predicates. +// --------------------------------------------------------------------------- + +struct StateZeroSkipDfa { + inner: CompactDfa, + /// For each code byte (0..255), true if it keeps state 0 → state 0. 
+ trivial: [bool; 256], +} + +impl StateZeroSkipDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let inner = CompactDfa::new(symbols, symbol_lengths, needle); + + let mut trivial = [false; 256]; + for code in 0..256 { + // A code is trivial if from state 0 it goes back to state 0 + // and it's not the escape sentinel. + let next = inner.transitions[code]; // state 0 * 256 + code + trivial[code] = next == 0 && code as u8 != ESCAPE_CODE; + } + + Self { inner, trivial } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + // Skip leading trivial codes. + let mut start = 0; + while start < codes.len() && self.trivial[codes[start] as usize] { + start += 1; + } + if start == codes.len() { + return self.inner.accept_state == 0; + } + // Run the DFA from the first non-trivial code. + self.inner.matches_no_early_exit(&codes[start..]) + } +} + +// --------------------------------------------------------------------------- +// Approach 9: Shift-based DFA — pack all state transitions into a u64. +// +// For a DFA with S ≤ 21 states (3 bits each fit in 63 bits of a u64), +// we store the transitions for ALL states for a given input byte in one u64. +// Transition: next_state = (table[code_byte] >> (state * BITS)) & MASK +// +// The key advantage: the table load depends only on code_byte (known from +// the input stream), NOT on the current state. This breaks the load-use +// dependency chain that makes traditional table-lookup DFAs slow (~4 cycle +// L1 latency per transition). With the shift-based approach, the table +// value can be loaded while the previous transition's shift is executing. +// --------------------------------------------------------------------------- + +struct ShiftDfa { + /// For each code byte (0..255): a u64 packing all state transitions. + /// Bits [state*3 .. state*3+3) encode the next state for that input. + transitions: [u64; 256], + /// Same layout for escape byte transitions. 
+ escape_transitions: [u64; 256], + accept_state: u8, + escape_sentinel: u8, +} + +impl ShiftDfa { + const BITS: u32 = 4; // bits per state (supports up to 16 states = 2^4) + const MASK: u64 = (1 << Self::BITS) - 1; + + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + assert!( + needle.len() + 2 <= (1 << Self::BITS), + "needle too long for 4-bit states (max 14 chars)" + ); + + let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); + + // Pack the fused u16 transitions into u64 shift tables. + let n_states = needle.len() + 1; + let escape_sentinel_u8 = fused.escape_sentinel as u8; + + let mut transitions = [0u64; 256]; + let mut escape_transitions = [0u64; 256]; + + for code_byte in 0..256usize { + let mut packed = 0u64; + for state in 0..n_states { + let next = fused.transitions[state * 256 + code_byte]; + // Map the escape sentinel to a value that fits in 3 bits. + let next_u8 = if next == fused.escape_sentinel { + escape_sentinel_u8 + } else { + next as u8 + }; + packed |= (next_u8 as u64) << (state as u32 * Self::BITS); + } + transitions[code_byte] = packed; + } + + for byte_val in 0..256usize { + let mut packed = 0u64; + for state in 0..n_states { + let next = fused.escape_transitions[state * 256 + byte_val] as u8; + packed |= (next as u64) << (state as u32 * Self::BITS); + } + escape_transitions[byte_val] = packed; + } + + Self { + transitions, + escape_transitions, + accept_state: fused.accept_state as u8, + escape_sentinel: escape_sentinel_u8, + } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + let mut state = 0u8; + let mut pos = 0; + while pos < codes.len() { + if state == self.accept_state { + return true; + } + let code = codes[pos]; + pos += 1; + // The table load depends only on `code`, not on `state`. + // The shift depends on `state` but is a fast register op. 
+ let packed = self.transitions[code as usize]; + let next = ((packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + let esc_packed = self.escape_transitions[b as usize]; + state = ((esc_packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; + } else { + state = next; + } + } + state == self.accept_state + } + + #[inline] + fn matches_no_early_exit(&self, codes: &[u8]) -> bool { + let mut state = 0u8; + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + let packed = self.transitions[code as usize]; + let next = ((packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; + if next == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + let esc_packed = self.escape_transitions[b as usize]; + state = ((esc_packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; + } else { + state = next; + } + } + state == self.accept_state + } +} + +// --------------------------------------------------------------------------- +// Hybrid 1: Prefilter + ShiftDfa — skip strings with no relevant codes, +// then use the fastest DFA (ShiftDfa) for survivors. 
+// --------------------------------------------------------------------------- + +struct PrefilterShiftDfa { + inner: ShiftDfa, + relevant_codes: [bool; 256], +} + +impl PrefilterShiftDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let inner = ShiftDfa::new(symbols, symbol_lengths, needle); + + let mut needle_bytes = [false; 256]; + for &b in needle { + needle_bytes[b as usize] = true; + } + + let mut relevant_codes = [false; 256]; + for (code, (sym, &sym_len)) in symbols.iter().zip(symbol_lengths.iter()).enumerate() { + let sym_bytes = sym.to_u64().to_le_bytes(); + for &b in &sym_bytes[..sym_len as usize] { + if needle_bytes[b as usize] { + relevant_codes[code] = true; + break; + } + } + } + relevant_codes[ESCAPE_CODE as usize] = true; + + Self { + inner, + relevant_codes, + } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + if !codes.iter().any(|&c| self.relevant_codes[c as usize]) { + return false; + } + self.inner.matches_no_early_exit(codes) + } +} + +// --------------------------------------------------------------------------- +// Hybrid 2: StateZero skip + ShiftDfa — skip leading trivial codes, +// then use ShiftDfa for the remainder. 
+// --------------------------------------------------------------------------- + +struct StateZeroShiftDfa { + inner: ShiftDfa, + trivial: [bool; 256], +} + +impl StateZeroShiftDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let inner = ShiftDfa::new(symbols, symbol_lengths, needle); + + let mut trivial = [false; 256]; + for code in 0..256 { + let packed = inner.transitions[code]; + let next = (packed & ShiftDfa::MASK) as u8; + trivial[code] = next == 0 && code as u8 != ESCAPE_CODE; + } + + Self { inner, trivial } + } + + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + let mut start = 0; + while start < codes.len() && self.trivial[codes[start] as usize] { + start += 1; + } + if start == codes.len() { + return self.inner.accept_state == 0; + } + self.inner.matches_no_early_exit(&codes[start..]) + } +} + +// --------------------------------------------------------------------------- +// Approach 9: Sheng DFA — use SSSE3 PSHUFB for transitions. +// +// The state is a byte position in an XMM register. For each input byte, +// we load a 16-byte shuffle mask and do PSHUFB(mask, state_vec). +// PSHUFB uses the low 4 bits of each byte lane as an index into the mask, +// producing the next state. With ≤16 states this is a single instruction. +// +// The shuffle mask load depends only on the input byte (not on state), +// so it can be loaded in parallel with the previous PSHUFB's execution. +// Throughput: ~1 byte/cycle (limited by PSHUFB throughput of 1/cycle on +// most microarchitectures). +// --------------------------------------------------------------------------- + +#[cfg(target_arch = "x86_64")] +struct ShengDfa { + /// 256 shuffle masks, one per possible input byte. + /// Each mask is 16 bytes: mask[i] = next_state when current state == i. + masks: Vec, + /// 256 escape masks for escaped byte values. 
+ escape_masks: Vec, + accept_state: u8, + escape_sentinel: u8, +} + +#[cfg(target_arch = "x86_64")] +impl ShengDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + use std::arch::x86_64::_mm_set_epi8; + + let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); + let escape_sentinel = fused.escape_sentinel as u8; + + let mut masks = Vec::with_capacity(256); + let mut escape_masks = Vec::with_capacity(256); + + for code_byte in 0..256usize { + let mut mask_bytes = [0u8; 16]; + for state in 0..16 { + if state < needle.len() + 1 { + let next = fused.transitions[state * 256 + code_byte]; + mask_bytes[state] = if next == fused.escape_sentinel { + escape_sentinel + } else { + next as u8 + }; + } + } + masks.push(unsafe { + _mm_set_epi8( + mask_bytes[15] as i8, + mask_bytes[14] as i8, + mask_bytes[13] as i8, + mask_bytes[12] as i8, + mask_bytes[11] as i8, + mask_bytes[10] as i8, + mask_bytes[9] as i8, + mask_bytes[8] as i8, + mask_bytes[7] as i8, + mask_bytes[6] as i8, + mask_bytes[5] as i8, + mask_bytes[4] as i8, + mask_bytes[3] as i8, + mask_bytes[2] as i8, + mask_bytes[1] as i8, + mask_bytes[0] as i8, + ) + }); + } + + for byte_val in 0..256usize { + let mut mask_bytes = [0u8; 16]; + for state in 0..16 { + if state < needle.len() + 1 { + mask_bytes[state] = fused.escape_transitions[state * 256 + byte_val] as u8; + } + } + escape_masks.push(unsafe { + _mm_set_epi8( + mask_bytes[15] as i8, + mask_bytes[14] as i8, + mask_bytes[13] as i8, + mask_bytes[12] as i8, + mask_bytes[11] as i8, + mask_bytes[10] as i8, + mask_bytes[9] as i8, + mask_bytes[8] as i8, + mask_bytes[7] as i8, + mask_bytes[6] as i8, + mask_bytes[5] as i8, + mask_bytes[4] as i8, + mask_bytes[3] as i8, + mask_bytes[2] as i8, + mask_bytes[1] as i8, + mask_bytes[0] as i8, + ) + }); + } + + Self { + masks, + escape_masks, + accept_state: fused.accept_state as u8, + escape_sentinel, + } + } + + #[inline] + #[target_feature(enable = "ssse3")] + unsafe fn matches(&self, 
codes: &[u8]) -> bool { + use std::arch::x86_64::_mm_extract_epi8; + use std::arch::x86_64::_mm_set1_epi8; + use std::arch::x86_64::_mm_shuffle_epi8; + + unsafe { + let mut state_vec = _mm_set1_epi8(0); + let mut pos = 0; + + while pos < codes.len() { + let cur_state = _mm_extract_epi8::<0>(state_vec) as u8; + if cur_state == self.accept_state { + return true; + } + + let code = codes[pos]; + pos += 1; + + // One PSHUFB: the mask load depends only on `code`, not state. + let next_vec = _mm_shuffle_epi8(self.masks[code as usize], state_vec); + let next_state = _mm_extract_epi8::<0>(next_vec) as u8; + + if next_state == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state_vec = _mm_shuffle_epi8(self.escape_masks[b as usize], state_vec); + } else { + state_vec = next_vec; + } + } + + _mm_extract_epi8::<0>(state_vec) as u8 == self.accept_state + } + } + + #[inline] + #[target_feature(enable = "ssse3")] + unsafe fn matches_no_early_exit(&self, codes: &[u8]) -> bool { + use std::arch::x86_64::_mm_extract_epi8; + use std::arch::x86_64::_mm_set1_epi8; + use std::arch::x86_64::_mm_shuffle_epi8; + + unsafe { + let mut state_vec = _mm_set1_epi8(0); + let mut pos = 0; + + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + + let next_vec = _mm_shuffle_epi8(self.masks[code as usize], state_vec); + let next_state = _mm_extract_epi8::<0>(next_vec) as u8; + + if next_state == self.escape_sentinel { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + state_vec = _mm_shuffle_epi8(self.escape_masks[b as usize], state_vec); + } else { + state_vec = next_vec; + } + } + + _mm_extract_epi8::<0>(state_vec) as u8 == self.accept_state + } + } +} + +// --------------------------------------------------------------------------- +// Approach 10: Speculative/Enumerated DFA — run from ALL start states at once. 
+// +// For a DFA with S states and a code sequence of length L, we process codes +// sequentially but track S states simultaneously. Each "state" in our vector +// is the result of starting from a different initial state. After processing +// the full sequence, we look up the result for initial state 0. +// +// Why is this useful? It enables processing codes in independent chunks: +// each chunk can run in parallel, and results are chained by composing +// the state-to-state mappings. For small S this is very efficient. +// --------------------------------------------------------------------------- + +struct EnumeratedDfa { + /// For each (state, code_byte): next state. 256 entries per state. + transitions: Vec, + escape_transitions: Vec, + n_states: usize, + accept_state: u16, + escape_sentinel: u16, +} + +impl EnumeratedDfa { + fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { + let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); + Self { + transitions: fused.transitions, + escape_transitions: fused.escape_transitions, + n_states: needle.len() + 1, + accept_state: fused.accept_state, + escape_sentinel: fused.escape_sentinel, + } + } + + /// Process a single code sequence by tracking all possible start states. + /// Returns true if starting from state 0 reaches accept. + #[inline] + fn matches(&self, codes: &[u8]) -> bool { + // For each possible start state, track where it ends up. 
+ // state_map[s] = "if we started in state s, we'd now be in state state_map[s]" + let ns = self.n_states; + let mut state_map: [u16; 16] = [0; 16]; // supports up to 16 states + for s in 0..ns { + state_map[s] = s as u16; + } + + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + + let next_fn = self.transitions.as_ptr(); + let esc_fn = self.escape_transitions.as_ptr(); + + if code == ESCAPE_CODE { + if pos >= codes.len() { + return false; + } + let b = codes[pos]; + pos += 1; + for s in 0..ns { + let cur = state_map[s]; + state_map[s] = unsafe { *esc_fn.add(cur as usize * 256 + b as usize) }; + } + } else { + for s in 0..ns { + let cur = state_map[s]; + let next = unsafe { *next_fn.add(cur as usize * 256 + code as usize) }; + state_map[s] = if next == self.escape_sentinel { + // shouldn't happen for non-escape codes + cur + } else { + next + }; + } + } + + // Early exit: if starting from state 0 we've already accepted + if state_map[0] == self.accept_state { + return true; + } + } + + state_map[0] == self.accept_state + } + + /// Chunked parallel version: split codes into chunks, process each chunk + #[allow(dead_code)] + /// to get a state mapping, then compose mappings. + #[inline] + fn matches_chunked(&self, codes: &[u8], chunk_size: usize) -> bool { + if codes.is_empty() { + return self.accept_state == 0; + } + + let ns = self.n_states; + + // Process the full sequence but in chunks, building state maps that + // could theoretically be parallelized. + let mut global_map: [u16; 16] = [0; 16]; + for s in 0..ns { + global_map[s] = s as u16; + } + + // We still process sequentially here but the structure allows future + // parallelization with rayon/SIMD on independent chunks. + let mut pos = 0; + while pos < codes.len() { + let chunk_end = (pos + chunk_size).min(codes.len()); + + // Build mapping for this chunk: for each start state, what's the end state? 
+ let mut chunk_map: [u16; 16] = [0; 16]; + for start_state in 0..ns { + let mut state = start_state as u16; + let mut p = pos; + while p < chunk_end { + let code = codes[p]; + p += 1; + let next = self.transitions[state as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if p >= chunk_end { + // Escape spans chunk boundary — just do the lookup + // with byte 0 as placeholder, will be corrected + break; + } + let b = codes[p]; + p += 1; + state = self.escape_transitions[state as usize * 256 + b as usize]; + } else { + state = next; + } + } + chunk_map[start_state] = state; + } + + // Compose: global_map = chunk_map(global_map) + let mut new_global: [u16; 16] = [0; 16]; + for s in 0..ns { + new_global[s] = chunk_map[global_map[s] as usize]; + } + global_map = new_global; + + pos = chunk_end; + } + + global_map[0] == self.accept_state + } +} + +// --------------------------------------------------------------------------- +// Approach 6: Speculative multi-string — process multiple strings, each with +// early-exit SIMD checking across the batch after each code step. +// --------------------------------------------------------------------------- + +impl FusedTableDfa { + /// Process N strings at once. After each code step, check if ALL strings + /// have resolved (accepted or exhausted). Uses u16 states packed for + /// potential SIMD comparison. 
+ #[inline] + fn matches_multi_early_exit( + &self, + all_bytes: &[u8], + starts: &[usize; N], + ends: &[usize; N], + ) -> [bool; N] { + let mut states = [0u16; N]; + let mut pos = *starts; + let mut resolved = 0u32; // bitmask of resolved strings + + let all_resolved = (1u32 << N) - 1; + + loop { + if resolved == all_resolved { + break; + } + + let mut any_progress = false; + for k in 0..N { + if resolved & (1 << k) != 0 { + continue; + } + if pos[k] >= ends[k] { + resolved |= 1 << k; + continue; + } + any_progress = true; + + let code = all_bytes[pos[k]]; + pos[k] += 1; + let next = self.transitions[states[k] as usize * 256 + code as usize]; + if next == self.escape_sentinel { + if pos[k] >= ends[k] { + resolved |= 1 << k; + continue; + } + let b = all_bytes[pos[k]]; + pos[k] += 1; + states[k] = self.escape_transitions[states[k] as usize * 256 + b as usize]; + } else { + states[k] = next; + } + if states[k] == self.accept_state { + resolved |= 1 << k; + } + } + if !any_progress { + break; + } + } + + std::array::from_fn(|k| states[k] == self.accept_state) + } +} + +// --------------------------------------------------------------------------- +// Pre-extracted data for alloc-free benchmarking +// --------------------------------------------------------------------------- + +struct PreparedArray { + all_bytes: Vec, + offsets: Vec, + n: usize, +} + +impl PreparedArray { + fn from_fsst(array: &FSSTArray) -> Self { + let codes = array.codes(); + let offsets_prim = codes.offsets().to_primitive(); + let all_bytes = codes.bytes(); + let all_bytes = all_bytes.as_slice().to_vec(); + let n = codes.len(); + + let offsets: Vec = match_each_integer_ptype!(offsets_prim.ptype(), |T| { + offsets_prim + .as_slice::() + .iter() + .map(|&v| v as usize) + .collect() + }); + + Self { + all_bytes, + offsets, + n, + } + } +} + +// --------------------------------------------------------------------------- +// Benchmark helpers +// 
--------------------------------------------------------------------------- + +#[inline(never)] +fn run_split(dfa: &SplitTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches(&prep.all_bytes[start..end]) { + out.set(i); + } + } +} + +#[inline(never)] +fn run_fused(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches(&prep.all_bytes[start..end]) { + out.set(i); + } + } +} + +#[inline(never)] +fn run_fused_no_exit(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches_no_early_exit(&prep.all_bytes[start..end]) { + out.set(i); + } + } +} + +#[inline(never)] +fn run_fused_unsafe(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if unsafe { dfa.matches_unchecked(&prep.all_bytes[start..end]) } { + out.set(i); + } + } +} + +#[inline(never)] +fn run_fused_no_exit_unsafe(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } { + out.set(i); + } + } +} + +#[inline(never)] +fn run_branchless(dfa: &BranchlessEscapeDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches(&prep.all_bytes[start..end]) { + out.set(i); + } + } +} + +#[cfg(target_arch = "x86_64")] +#[inline(never)] +fn run_simd_gather_8(dfa: &SimdGatherDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + let mut i = 0; + while i + 8 <= prep.n { + let starts: [usize; 8] = std::array::from_fn(|k| 
prep.offsets[i + k]); + let ends: [usize; 8] = std::array::from_fn(|k| prep.offsets[i + k + 1]); + + #[cfg(target_feature = "avx2")] + let results = unsafe { dfa.matches_8_avx2(&prep.all_bytes, &starts, &ends) }; + #[cfg(not(target_feature = "avx2"))] + let results = { + let mut r = [false; 8]; + for k in 0..8 { + r[k] = dfa.matches_scalar(&prep.all_bytes[starts[k]..ends[k]]); + } + r + }; + + for k in 0..8 { + if results[k] { + out.set(i + k); + } + } + i += 8; + } + // Remainder + while i < prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches_scalar(&prep.all_bytes[start..end]) { + out.set(i); + } + i += 1; + } +} + +#[inline(never)] +fn run_compact(dfa: &CompactDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches(&prep.all_bytes[start..end]) { + out.set(i); + } + } +} + +#[inline(never)] +fn run_prefilter(dfa: &PrefilterDfa, prep: &PreparedArray, out: &mut BitBufferMut) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches(&prep.all_bytes[start..end]) { + out.set(i); + } + } +} + +fn bench_decompress(array: &FSSTArray, needle: &[u8], out: &mut Vec) { + out.clear(); + let decompressor = array.decompressor(); + array.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + decompressed.windows(needle.len()).any(|w| w == needle) + } + None => false, + })); + }); +} + +// --------------------------------------------------------------------------- +// Alloc-free decompress + match: reuse a buffer, inline the decompress logic. +// This measures pure decompress+search cost without per-string allocation. +// --------------------------------------------------------------------------- + +/// Decompress FSST codes into `buf`, returning the number of bytes written. 
+/// This avoids all allocation by writing into a caller-provided buffer. +#[inline] +fn decompress_into(codes: &[u8], symbols: &[Symbol], symbol_lengths: &[u8], buf: &mut Vec) { + buf.clear(); + let mut pos = 0; + while pos < codes.len() { + let code = codes[pos]; + pos += 1; + if code == ESCAPE_CODE { + if pos < codes.len() { + buf.push(codes[pos]); + pos += 1; + } + } else { + let sym = symbols[code as usize].to_u64().to_le_bytes(); + let len = symbol_lengths[code as usize] as usize; + buf.extend_from_slice(&sym[..len]); + } + } +} + +/// Alloc-free decompress + sliding window match using PreparedArray. +/// Pre-allocates the decompression buffer once outside the benchmark loop. +#[inline(never)] +fn run_decompress_match( + prep: &PreparedArray, + symbols: &[Symbol], + symbol_lengths: &[u8], + needle: &[u8], + buf: &mut Vec, + out: &mut BitBufferMut, +) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + decompress_into(&prep.all_bytes[start..end], symbols, symbol_lengths, buf); + if buf.windows(needle.len()).any(|w| w == needle) { + out.set(i); + } + } +} + +/// Alloc-free decompress + memmem match using PreparedArray. 
+#[inline(never)] +fn run_decompress_memmem( + prep: &PreparedArray, + symbols: &[Symbol], + symbol_lengths: &[u8], + needle: &[u8], + buf: &mut Vec, + out: &mut BitBufferMut, +) { + let finder = memmem::Finder::new(needle); + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + decompress_into(&prep.all_bytes[start..end], symbols, symbol_lengths, buf); + if finder.find(buf).is_some() { + out.set(i); + } + } +} + +// --------------------------------------------------------------------------- +// Benchmarks +// --------------------------------------------------------------------------- + +const N: usize = 100_000; +const NEEDLE: &[u8] = b"google"; + +// --------------------------------------------------------------------------- +// ClickBench-style URL generator (longer, more realistic URLs with query +// params, fragments, UTM tracking, referrers, etc.) +// --------------------------------------------------------------------------- + +const CB_DOMAINS: &[&str] = &[ + "www.google.com", + "yandex.ru", + "mail.ru", + "vk.com", + "www.youtube.com", + "www.facebook.com", + "ok.ru", + "go.mail.ru", + "www.avito.ru", + "pogoda.yandex.ru", + "news.yandex.ru", + "maps.yandex.ru", + "market.yandex.ru", + "afisha.yandex.ru", + "auto.ru", + "www.kinopoisk.ru", + "www.ozon.ru", + "www.wildberries.ru", + "aliexpress.ru", + "lenta.ru", +]; + +const CB_PATHS: &[&str] = &[ + "/search", + "/catalog/electronics/smartphones", + "/product/item/123456789", + "/news/2024/03/15/article-about-technology", + "/user/profile/settings/notifications", + "/api/v2/catalog/search", + "/checkout/cart/summary", + "/blog/2024/how-to-optimize-database-queries-for-better-performance", + "/category/home-and-garden/furniture/tables", + "/", +]; + +const CB_PARAMS: &[&str] = &[ + "?utm_source=google&utm_medium=cpc&utm_campaign=spring_sale_2024&utm_content=banner_v2", + "?q=buy+smartphone+online+cheap+free+shipping&category=electronics&sort=price_asc&page=3", + 
"?ref=main_page_carousel_block_position_4&sessionid=abc123def456", + "?from=tabbar&clid=2270455&text=weather+forecast+tomorrow", + "?lr=213&msid=1234567890.12345&suggest_reqid=abcdef&csg=12345", + "", + "", + "", + "?page=1&per_page=20", + "?source=serp&forceshow=1", +]; + +const CB_FRAGMENTS: &[&str] = &[ + "", + "", + "", + "#section-reviews", + "#comments", + "#price-history", + "", + "", + "", + "", +]; + +fn generate_clickbench_urls(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(123); + (0..n) + .map(|_| { + let scheme = if rng.random_bool(0.7) { + "https" + } else { + "http" + }; + let domain = CB_DOMAINS[rng.random_range(0..CB_DOMAINS.len())]; + let path = CB_PATHS[rng.random_range(0..CB_PATHS.len())]; + let params = CB_PARAMS[rng.random_range(0..CB_PARAMS.len())]; + let fragment = CB_FRAGMENTS[rng.random_range(0..CB_FRAGMENTS.len())]; + format!("{scheme}://{domain}{path}{params}{fragment}") + }) + .collect() +} + +fn make_fsst_clickbench_urls(n: usize) -> FSSTArray { + let urls = generate_clickbench_urls(n); + let varbin = VarBinArray::from_iter( + urls.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +const CB_NEEDLE: &[u8] = b"yandex"; + +// --------------------------------------------------------------------------- +// Log lines generator (Apache/nginx-style access logs) +// --------------------------------------------------------------------------- + +const LOG_METHODS: &[&str] = &["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD"]; +const LOG_PATHS: &[&str] = &[ + "/api/v1/users", + "/api/v2/products/search", + "/healthcheck", + "/static/js/app.bundle.min.js", + "/favicon.ico", + "/login", + "/dashboard/analytics", + "/api/v1/orders/12345/status", + "/graphql", + "/metrics", +]; +const LOG_STATUS: &[u16] = &[ + 200, 200, 200, 200, 200, 201, 301, 302, 400, 403, 404, 500, 502, +]; +const LOG_IPS: &[&str] = &[ + 
"192.168.1.1", + "10.0.0.42", + "172.16.0.100", + "203.0.113.50", + "198.51.100.23", + "8.8.8.8", + "1.1.1.1", + "74.125.200.100", + "151.101.1.69", + "93.184.216.34", +]; +const LOG_UAS: &[&str] = &[ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)", + "curl/7.81.0", + "python-requests/2.28.1", + "Go-http-client/1.1", + "Googlebot/2.1 (+http://www.google.com/bot.html)", +]; + +fn generate_log_lines(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(456); + (0..n) + .map(|_| { + let ip = LOG_IPS[rng.random_range(0..LOG_IPS.len())]; + let method = LOG_METHODS[rng.random_range(0..LOG_METHODS.len())]; + let path = LOG_PATHS[rng.random_range(0..LOG_PATHS.len())]; + let status = LOG_STATUS[rng.random_range(0..LOG_STATUS.len())]; + let size = rng.random_range(100..50000); + let ua = LOG_UAS[rng.random_range(0..LOG_UAS.len())]; + format!( + r#"{ip} - - [15/Mar/2024:10:{:02}:{:02} +0000] "{method} {path} HTTP/1.1" {status} {size} "-" "{ua}""#, + rng.random_range(0..60u32), + rng.random_range(0..60u32), + ) + }) + .collect() +} + +fn make_fsst_log_lines(n: usize) -> FSSTArray { + let lines = generate_log_lines(n); + let varbin = VarBinArray::from_iter( + lines.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +const LOG_NEEDLE: &[u8] = b"Googlebot"; + +// --------------------------------------------------------------------------- +// JSON strings generator (typical API response payloads) +// --------------------------------------------------------------------------- + +const JSON_NAMES: &[&str] = &[ + "Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack", +]; +const JSON_CITIES: &[&str] = &[ + "New York", + "London", + "Tokyo", + "Berlin", + "Sydney", + "Toronto", + "Paris", + "Mumbai", + "São Paulo", + "Seoul", +]; +const JSON_TAGS: &[&str] = &[ + 
"premium", + "verified", + "admin", + "moderator", + "subscriber", + "trial", + "enterprise", + "developer", +]; + +fn generate_json_strings(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(789); + (0..n) + .map(|_| { + let name = JSON_NAMES[rng.random_range(0..JSON_NAMES.len())]; + let city = JSON_CITIES[rng.random_range(0..JSON_CITIES.len())]; + let age = rng.random_range(18..80u32); + let tag1 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; + let tag2 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; + let id = rng.random_range(10000..99999u32); + format!( + r#"{{"id":{id},"name":"{name}","age":{age},"city":"{city}","tags":["{tag1}","{tag2}"],"active":true}}"# + ) + }) + .collect() +} + +fn make_fsst_json_strings(n: usize) -> FSSTArray { + let jsons = generate_json_strings(n); + let varbin = VarBinArray::from_iter( + jsons.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +const JSON_NEEDLE: &[u8] = b"enterprise"; + +// --------------------------------------------------------------------------- +// File paths generator (Unix-style paths with various depths) +// --------------------------------------------------------------------------- + +const PATH_ROOTS: &[&str] = &[ + "/home/user", + "/var/log", + "/etc", + "/usr/local/bin", + "/opt/app", + "/tmp", + "/srv/www", + "/data/warehouse", +]; +const PATH_DIRS: &[&str] = &[ + "src", + "build", + "dist", + "node_modules", + "target/release", + "config", + ".cache", + "logs/2024", + "backups/daily", + "migrations", +]; +const PATH_FILES: &[&str] = &[ + "main.rs", + "index.ts", + "config.yaml", + "Dockerfile", + "schema.sql", + "app.log", + "data.parquet", + "model.onnx", + "README.md", + "package.json", +]; + +fn generate_file_paths(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(321); + (0..n) + .map(|_| { + let root = PATH_ROOTS[rng.random_range(0..PATH_ROOTS.len())]; + let 
dir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; + let file = PATH_FILES[rng.random_range(0..PATH_FILES.len())]; + let depth = rng.random_range(0..3u32); + let mut path = format!("{root}/{dir}"); + for _ in 0..depth { + let subdir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; + path.push('/'); + path.push_str(subdir); + } + path.push('/'); + path.push_str(file); + path + }) + .collect() +} + +fn make_fsst_file_paths(n: usize) -> FSSTArray { + let paths = generate_file_paths(n); + let varbin = VarBinArray::from_iter( + paths.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +const PATH_NEEDLE: &[u8] = b"target/release"; + +// --------------------------------------------------------------------------- +// Email addresses generator +// --------------------------------------------------------------------------- + +const EMAIL_USERS: &[&str] = &[ + "john.doe", + "jane.smith", + "admin", + "support", + "no-reply", + "sales.team", + "dev+test", + "marketing", + "info", + "contact.us", +]; +const EMAIL_DOMAINS: &[&str] = &[ + "gmail.com", + "yahoo.com", + "outlook.com", + "company.io", + "example.org", + "mail.ru", + "protonmail.com", + "fastmail.com", + "icloud.com", + "hey.com", +]; + +fn generate_emails(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(654); + (0..n) + .map(|_| { + let user = EMAIL_USERS[rng.random_range(0..EMAIL_USERS.len())]; + let domain = EMAIL_DOMAINS[rng.random_range(0..EMAIL_DOMAINS.len())]; + let suffix = rng.random_range(0..1000u32); + format!("{user}{suffix}@{domain}") + }) + .collect() +} + +fn make_fsst_emails(n: usize) -> FSSTArray { + let emails = generate_emails(n); + let varbin = VarBinArray::from_iter( + emails.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +const 
EMAIL_NEEDLE: &[u8] = b"gmail"; + +/// Macro to reduce boilerplate for DFA benchmarks with pre-allocated output. +macro_rules! dfa_bench { + ($name:ident, $dfa_ty:ident, $run_fn:ident) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = $dfa_ty::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + $run_fn(&dfa, &prep, &mut out); + }); + } + }; +} + +// 1. Split table (production baseline) +dfa_bench!(split_table, SplitTableDfa, run_split); + +// 2. Fused 256-wide table +dfa_bench!(fused_table, FusedTableDfa, run_fused); + +// 3. Fused table, no early exit on accept +dfa_bench!(fused_no_early_exit, FusedTableDfa, run_fused_no_exit); + +// 4. Fused table, unsafe (no bounds checks) +dfa_bench!(fused_unsafe, FusedTableDfa, run_fused_unsafe); + +// 5. Fused table, no early exit + unsafe +dfa_bench!( + fused_no_exit_unsafe, + FusedTableDfa, + run_fused_no_exit_unsafe +); + +// 6. Branchless escape handling +dfa_bench!(branchless_escape, BranchlessEscapeDfa, run_branchless); + +// 7. SIMD gather (8 strings at a time, u32 table) +#[cfg(target_arch = "x86_64")] +#[divan::bench] +fn simd_gather_8(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = SimdGatherDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + run_simd_gather_8(&dfa, &prep, &mut out); + }); +} + +// 8. Decompress then search (worst-case baseline, allocates per string) +#[divan::bench] +fn decompress_then_search(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + bench_decompress(&fsst, NEEDLE, &mut out); + }); +} + +// 8b. 
Alloc-free decompress + sliding window match +#[divan::bench] +fn decompress_no_alloc(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let symbols = fsst.symbols(); + let symbol_lengths = fsst.symbol_lengths(); + let mut buf = Vec::with_capacity(256); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + run_decompress_match( + &prep, + symbols.as_slice(), + symbol_lengths.as_slice(), + NEEDLE, + &mut buf, + &mut out, + ); + }); +} + +// 8c. Alloc-free decompress + memmem (SIMD substring search) +#[divan::bench] +fn decompress_no_alloc_memmem(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let symbols = fsst.symbols(); + let symbol_lengths = fsst.symbol_lengths(); + let mut buf = Vec::with_capacity(256); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + run_decompress_memmem( + &prep, + symbols.as_slice(), + symbol_lengths.as_slice(), + NEEDLE, + &mut buf, + &mut out, + ); + }); +} + +// 9. Chunk-of-64: match 64 strings, stack-alloc results, then pack bits. +// This aligns with collect_bool's internal 64-bit chunking. +#[divan::bench] +fn fused_chunk_64(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = FusedTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches_no_early_exit(&prep.all_bytes[start..end]) + }) + }); +} + +// 10. Chunk-of-64 with unsafe matches. 
+#[divan::bench] +fn fused_chunk_64_unsafe(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = FusedTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } + }) + }); +} + +// 11. Compact u8 table (halved table size) +dfa_bench!(compact_table, CompactDfa, run_compact); + +// 12. Compact u8 + collect_bool +#[divan::bench] +fn compact_chunk_64(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = CompactDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches_no_early_exit(&prep.all_bytes[start..end]) + }) + }); +} + +// 13. Compact u8 + collect_bool + unsafe +#[divan::bench] +fn compact_chunk_64_unsafe(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = CompactDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } + }) + }); +} + +// 14. Prefilter (skip strings with no relevant codes) +dfa_bench!(prefilter, PrefilterDfa, run_prefilter); + +// 15. 
// Prefilter + collect_bool
#[divan::bench]
fn prefilter_chunk_64(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = PrefilterDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            dfa.matches_no_early_exit(&prep.all_bytes[start..end])
        })
    });
}

// 16. Streaming continuous scan (single pass through all codes)
#[divan::bench]
fn streaming_continuous(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = CompactDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    // Result bitmap is reused across iterations; cleared inside the closure.
    let mut out = BitBufferMut::new_unset(N);
    bencher.bench_local(|| {
        out.fill_range(0, N, false);
        streaming_scan_continuous(&dfa, &prep.all_bytes, &prep.offsets, prep.n, &mut out);
    });
}

// 17. Shift-based DFA (u64 packed transitions)
#[divan::bench]
fn shift_dfa(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = ShiftDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            dfa.matches(&prep.all_bytes[start..end])
        })
    });
}

// 18. Shift-based DFA, no early exit
#[divan::bench]
fn shift_dfa_no_exit(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = ShiftDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            dfa.matches_no_early_exit(&prep.all_bytes[start..end])
        })
    });
}

// 19.
// Sheng DFA (PSHUFB transitions)
#[cfg(target_arch = "x86_64")]
#[divan::bench]
fn sheng_dfa(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = ShengDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            // SAFETY: ShengDfa::matches is unsafe, presumably because it uses
            // x86_64 SIMD intrinsics (gated by cfg above) -- TODO(review)
            // confirm its documented contract.
            unsafe { dfa.matches(&prep.all_bytes[start..end]) }
        })
    });
}

// 20. Sheng DFA, no early exit
#[cfg(target_arch = "x86_64")]
#[divan::bench]
fn sheng_dfa_no_exit(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = ShengDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            // SAFETY: same reasoning as sheng_dfa above -- TODO(review) confirm.
            unsafe { dfa.matches_no_early_exit(&prep.all_bytes[start..end]) }
        })
    });
}

// 21. Enumerated DFA (track all start states)
#[divan::bench]
fn enumerated_dfa(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = EnumeratedDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            dfa.matches(&prep.all_bytes[start..end])
        })
    });
}

// 12.
Multi-string early exit with bitmask (8 at a time) +#[divan::bench] +fn fused_multi_early_exit_8(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = FusedTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + let mut i = 0; + while i + 8 <= prep.n { + let starts: [usize; 8] = std::array::from_fn(|k| prep.offsets[i + k]); + let ends: [usize; 8] = std::array::from_fn(|k| prep.offsets[i + k + 1]); + let results = dfa.matches_multi_early_exit(&prep.all_bytes, &starts, &ends); + for k in 0..8 { + if results[k] { + out.set(i + k); + } + } + i += 8; + } + while i < prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + if dfa.matches(&prep.all_bytes[start..end]) { + out.set(i); + } + i += 1; + } + }); +} + +// Aho-Corasick on decompressed data: decompress each string then search with aho-corasick +#[divan::bench] +fn aho_corasick_decompress(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let ac = AhoCorasick::new([NEEDLE]).unwrap(); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + ac.is_match(&decompressed) + } + None => false, + })); + }); + out + }); +} + +// Aho-Corasick on raw (canonicalized) bytes: decompress the whole array up front, +// then search each string using aho-corasick's SIMD-accelerated search +#[divan::bench] +fn aho_corasick_on_raw_bytes(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let canonical = fsst.to_canonical().unwrap().into_varbinview(); + let ac = AhoCorasick::new([NEEDLE]).unwrap(); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + canonical.with_iterator(|iter| { + out.extend(iter.map(|s| 
match s { + Some(bytes) => ac.is_match(bytes), + None => false, + })); + }); + out + }); +} + +// 13. Original collect_bool approach (includes alloc) +#[divan::bench] +fn split_table_collect_bool(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = SplitTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches(&prep.all_bytes[start..end]) + }) + }); +} + +// --------------------------------------------------------------------------- +// ClickBench-style URL benchmarks (longer URLs with query params, fragments) +// --------------------------------------------------------------------------- + +#[divan::bench] +fn cb_split_table(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = SplitTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches(&prep.all_bytes[start..end]) + }) + }); +} + +#[divan::bench] +fn cb_fused_table(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = FusedTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches(&prep.all_bytes[start..end]) + }) + }); +} + +#[divan::bench] +fn cb_fused_chunk_64(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = FusedTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + 
CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches_no_early_exit(&prep.all_bytes[start..end]) + }) + }); +} + +#[divan::bench] +fn cb_fused_chunk_64_unsafe(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = FusedTableDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } + }) + }); +} + +#[divan::bench] +fn cb_shift_dfa(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = ShiftDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches_no_early_exit(&prep.all_bytes[start..end]) + }) + }); +} + +#[cfg(target_arch = "x86_64")] +#[divan::bench] +fn cb_sheng_dfa(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = ShengDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + unsafe { dfa.matches_no_early_exit(&prep.all_bytes[start..end]) } + }) + }); +} + +#[divan::bench] +fn cb_compact_chunk_64_unsafe(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = CompactDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + 
BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } + }) + }); +} + +#[divan::bench] +fn cb_prefilter_chunk_64(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = PrefilterDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches_no_early_exit(&prep.all_bytes[start..end]) + }) + }); +} + +#[divan::bench] +fn cb_streaming_continuous(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = CompactDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + streaming_scan_continuous(&dfa, &prep.all_bytes, &prep.offsets, prep.n, &mut out); + }); +} + +#[divan::bench] +fn cb_decompress_then_search(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + bench_decompress(&fsst, CB_NEEDLE, &mut out); + }); +} + +#[divan::bench] +fn cb_decompress_no_alloc(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let symbols = fsst.symbols(); + let symbol_lengths = fsst.symbol_lengths(); + let mut buf = Vec::with_capacity(512); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + run_decompress_match( + &prep, + symbols.as_slice(), + symbol_lengths.as_slice(), + CB_NEEDLE, + &mut buf, + &mut out, + ); + }); +} + +#[divan::bench] +fn cb_decompress_no_alloc_memmem(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let 
prep = PreparedArray::from_fsst(&fsst); + let symbols = fsst.symbols(); + let symbol_lengths = fsst.symbol_lengths(); + let mut buf = Vec::with_capacity(512); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + run_decompress_memmem( + &prep, + symbols.as_slice(), + symbol_lengths.as_slice(), + CB_NEEDLE, + &mut buf, + &mut out, + ); + }); +} + +#[divan::bench] +fn cb_aho_corasick_decompress(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let ac = AhoCorasick::new([CB_NEEDLE]).unwrap(); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + ac.is_match(&decompressed) + } + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn cb_aho_corasick_on_raw_bytes(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let canonical = fsst.to_canonical().unwrap().into_varbinview(); + let ac = AhoCorasick::new([CB_NEEDLE]).unwrap(); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + canonical.with_iterator(|iter| { + out.extend(iter.map(|s| match s { + Some(bytes) => ac.is_match(bytes), + None => false, + })); + }); + out + }); +} + +// --------------------------------------------------------------------------- +// Benchmarks for additional data types (log lines, JSON, file paths, emails) +// --------------------------------------------------------------------------- + +/// Macro for benchmarks on a specific data generator + needle combo. +macro_rules! 
data_bench { + ($name:ident, $make_fn:ident, $needle:expr, $dfa_ty:ident, $match_method:ident) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = $make_fn(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = $dfa_ty::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + $needle, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.$match_method(&prep.all_bytes[start..end]) + }) + }); + } + }; +} + +// Log lines: long strings (~150 chars), low match rate for "Googlebot" +data_bench!( + log_split_table, + make_fsst_log_lines, + LOG_NEEDLE, + SplitTableDfa, + matches +); +data_bench!( + log_shift_dfa, + make_fsst_log_lines, + LOG_NEEDLE, + ShiftDfa, + matches_no_early_exit +); +data_bench!( + log_compact_no_exit, + make_fsst_log_lines, + LOG_NEEDLE, + CompactDfa, + matches_no_early_exit +); +data_bench!( + log_fused_no_exit, + make_fsst_log_lines, + LOG_NEEDLE, + FusedTableDfa, + matches_no_early_exit +); + +#[divan::bench] +fn log_decompress(bencher: Bencher) { + let fsst = make_fsst_log_lines(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + bench_decompress(&fsst, LOG_NEEDLE, &mut out); + }); +} + +// JSON strings: structured data (~80-100 chars), searching for "enterprise" +data_bench!( + json_split_table, + make_fsst_json_strings, + JSON_NEEDLE, + SplitTableDfa, + matches +); +data_bench!( + json_shift_dfa, + make_fsst_json_strings, + JSON_NEEDLE, + ShiftDfa, + matches_no_early_exit +); +data_bench!( + json_compact_no_exit, + make_fsst_json_strings, + JSON_NEEDLE, + CompactDfa, + matches_no_early_exit +); +data_bench!( + json_fused_no_exit, + make_fsst_json_strings, + JSON_NEEDLE, + FusedTableDfa, + matches_no_early_exit +); + +#[divan::bench] +fn json_decompress(bencher: Bencher) { + let fsst = make_fsst_json_strings(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + 
bench_decompress(&fsst, JSON_NEEDLE, &mut out); + }); +} + +// File paths: medium-length (~40-80 chars), searching for "target/release" +data_bench!( + path_split_table, + make_fsst_file_paths, + PATH_NEEDLE, + SplitTableDfa, + matches +); +data_bench!( + path_shift_dfa, + make_fsst_file_paths, + PATH_NEEDLE, + ShiftDfa, + matches_no_early_exit +); +data_bench!( + path_compact_no_exit, + make_fsst_file_paths, + PATH_NEEDLE, + CompactDfa, + matches_no_early_exit +); +data_bench!( + path_fused_no_exit, + make_fsst_file_paths, + PATH_NEEDLE, + FusedTableDfa, + matches_no_early_exit +); + +#[divan::bench] +fn path_decompress(bencher: Bencher) { + let fsst = make_fsst_file_paths(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + bench_decompress(&fsst, PATH_NEEDLE, &mut out); + }); +} + +// Email addresses: short strings (~20-30 chars), searching for "gmail" +data_bench!( + email_split_table, + make_fsst_emails, + EMAIL_NEEDLE, + SplitTableDfa, + matches +); +data_bench!( + email_shift_dfa, + make_fsst_emails, + EMAIL_NEEDLE, + ShiftDfa, + matches_no_early_exit +); +data_bench!( + email_compact_no_exit, + make_fsst_emails, + EMAIL_NEEDLE, + CompactDfa, + matches_no_early_exit +); +data_bench!( + email_fused_no_exit, + make_fsst_emails, + EMAIL_NEEDLE, + FusedTableDfa, + matches_no_early_exit +); + +#[divan::bench] +fn email_decompress(bencher: Bencher) { + let fsst = make_fsst_emails(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + bench_decompress(&fsst, EMAIL_NEEDLE, &mut out); + }); +} + +// --------------------------------------------------------------------------- +// memchr::memmem benchmarks — SIMD-accelerated substring search on decompressed data +// --------------------------------------------------------------------------- + +#[divan::bench] +fn memmem_decompress_urls(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let finder = memmem::Finder::new(NEEDLE); + bencher.bench_local(|| { + let mut out = 
Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + finder.find(&decompressed).is_some() + } + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn memmem_on_raw_bytes_urls(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let canonical = fsst.to_canonical().unwrap().into_varbinview(); + let finder = memmem::Finder::new(NEEDLE); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + canonical.with_iterator(|iter| { + out.extend(iter.map(|s| match s { + Some(bytes) => finder.find(bytes).is_some(), + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn cb_memmem_decompress(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let finder = memmem::Finder::new(CB_NEEDLE); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + finder.find(&decompressed).is_some() + } + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn cb_memmem_on_raw_bytes(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let canonical = fsst.to_canonical().unwrap().into_varbinview(); + let finder = memmem::Finder::new(CB_NEEDLE); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + canonical.with_iterator(|iter| { + out.extend(iter.map(|s| match s { + Some(bytes) => finder.find(bytes).is_some(), + None => false, + })); + }); + out + }); +} + +// --------------------------------------------------------------------------- +// Low match rate (~0.001%) benchmarks — needle appears in ~1/100K strings. +// Tests performance when almost no string matches (common in large datasets). +// Uses random alphanumeric strings with a rare injected match. 
+// --------------------------------------------------------------------------- + +const RARE_NEEDLE: &[u8] = b"xyzzy"; + +/// Generate N random alphanumeric strings (~40 chars each), injecting the needle +/// into approximately `match_rate` fraction of them. +fn generate_rare_match_strings(n: usize, match_rate: f64) -> Vec { + let mut rng = StdRng::seed_from_u64(999); + let charset: &[u8] = b"abcdefghijklmnopqrstuvwABCDEFGHIJKLMNOPQRSTUVW0123456789-_.:/"; + (0..n) + .map(|_| { + let len = rng.random_range(30..60); + let mut s: String = (0..len) + .map(|_| charset[rng.random_range(0..charset.len())] as char) + .collect(); + if rng.random_bool(match_rate) { + // Inject needle at random position + let pos = rng.random_range(0..s.len().saturating_sub(RARE_NEEDLE.len()) + 1); + s.replace_range( + pos..pos + RARE_NEEDLE.len().min(s.len() - pos), + std::str::from_utf8(RARE_NEEDLE).unwrap(), + ); + } + s + }) + .collect() +} + +fn make_fsst_rare_match(n: usize) -> FSSTArray { + let strings = generate_rare_match_strings(n, 0.00001); // ~0.001% + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +data_bench!( + rare_split_table, + make_fsst_rare_match, + RARE_NEEDLE, + SplitTableDfa, + matches +); +data_bench!( + rare_shift_dfa, + make_fsst_rare_match, + RARE_NEEDLE, + ShiftDfa, + matches_no_early_exit +); +data_bench!( + rare_compact_no_exit, + make_fsst_rare_match, + RARE_NEEDLE, + CompactDfa, + matches_no_early_exit +); +data_bench!( + rare_fused_no_exit, + make_fsst_rare_match, + RARE_NEEDLE, + FusedTableDfa, + matches_no_early_exit +); + +#[divan::bench] +fn rare_decompress(bencher: Bencher) { + let fsst = make_fsst_rare_match(N); + let mut out = Vec::with_capacity(N); + bencher.bench_local(|| { + bench_decompress(&fsst, RARE_NEEDLE, &mut out); + }); +} + +#[divan::bench] +fn 
rare_memmem_decompress(bencher: Bencher) { + let fsst = make_fsst_rare_match(N); + let finder = memmem::Finder::new(RARE_NEEDLE); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + finder.find(&decompressed).is_some() + } + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn rare_prefilter(bencher: Bencher) { + let fsst = make_fsst_rare_match(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = PrefilterDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + RARE_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches_no_early_exit(&prep.all_bytes[start..end]) + }) + }); +} + +data_bench!( + rare_state_zero_skip, + make_fsst_rare_match, + RARE_NEEDLE, + StateZeroSkipDfa, + matches +); + +// State-zero skip on URLs (moderate match rate) +data_bench!( + state_zero_skip_urls, + make_fsst_urls, + NEEDLE, + StateZeroSkipDfa, + matches +); + +// State-zero skip on ClickBench URLs +#[divan::bench] +fn cb_state_zero_skip(bencher: Bencher) { + let fsst = make_fsst_clickbench_urls(N); + let prep = PreparedArray::from_fsst(&fsst); + let dfa = StateZeroSkipDfa::new( + fsst.symbols().as_slice(), + fsst.symbol_lengths().as_slice(), + CB_NEEDLE, + ); + bencher.bench_local(|| { + BitBufferMut::collect_bool(prep.n, |i| { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + dfa.matches(&prep.all_bytes[start..end]) + }) + }); +} + +// --------------------------------------------------------------------------- +// Alloc-free decompress benchmarks for all data types +// --------------------------------------------------------------------------- + +macro_rules! 
decompress_no_alloc_bench { + ($name:ident, $make_fn:ident, $needle:expr, $bufsz:expr) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = $make_fn(N); + let prep = PreparedArray::from_fsst(&fsst); + let symbols = fsst.symbols(); + let symbol_lengths = fsst.symbol_lengths(); + let mut buf = Vec::with_capacity($bufsz); + let mut out = BitBufferMut::new_unset(N); + bencher.bench_local(|| { + out.fill_range(0, N, false); + run_decompress_memmem( + &prep, + symbols.as_slice(), + symbol_lengths.as_slice(), + $needle, + &mut buf, + &mut out, + ); + }); + } + }; +} + +decompress_no_alloc_bench!( + log_decompress_no_alloc, + make_fsst_log_lines, + LOG_NEEDLE, + 256 +); +decompress_no_alloc_bench!( + json_decompress_no_alloc, + make_fsst_json_strings, + JSON_NEEDLE, + 256 +); +decompress_no_alloc_bench!( + path_decompress_no_alloc, + make_fsst_file_paths, + PATH_NEEDLE, + 256 +); +decompress_no_alloc_bench!( + email_decompress_no_alloc, + make_fsst_emails, + EMAIL_NEEDLE, + 64 +); +decompress_no_alloc_bench!( + rare_decompress_no_alloc, + make_fsst_rare_match, + RARE_NEEDLE, + 128 +); + +// --------------------------------------------------------------------------- +// regex-automata DFA benchmarks +// --------------------------------------------------------------------------- + +#[divan::bench] +fn regex_automata_dense_decompress(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let re = DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + re.is_match(&decompressed) + } + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn regex_automata_dense_on_raw_bytes(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let canonical = fsst.to_canonical().unwrap().into_varbinview(); + let re 
= DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + canonical.with_iterator(|iter| { + out.extend(iter.map(|s| match s { + Some(bytes) => re.is_match(bytes), + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn regex_automata_sparse_decompress(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let dense = DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); + let (fwd, rev) = ( + dense.forward().to_sparse().unwrap(), + dense.reverse().to_sparse().unwrap(), + ); + let re = regex_automata::dfa::regex::Regex::builder().build_from_dfas(fwd, rev); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + let decompressor = fsst.decompressor(); + fsst.codes().with_iterator(|iter| { + out.extend(iter.map(|codes| match codes { + Some(c) => { + let decompressed = decompressor.decompress(c); + re.is_match(&decompressed) + } + None => false, + })); + }); + out + }); +} + +#[divan::bench] +fn regex_automata_sparse_on_raw_bytes(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let canonical = fsst.to_canonical().unwrap().into_varbinview(); + let dense = DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); + let (fwd, rev) = ( + dense.forward().to_sparse().unwrap(), + dense.reverse().to_sparse().unwrap(), + ); + let re = regex_automata::dfa::regex::Regex::builder().build_from_dfas(fwd, rev); + bencher.bench_local(|| { + let mut out = Vec::with_capacity(N); + canonical.with_iterator(|iter| { + out.extend(iter.map(|s| match s { + Some(bytes) => re.is_match(bytes), + None => false, + })); + }); + out + }); +} + +// --------------------------------------------------------------------------- +// jetscii benchmarks — PCMPESTRI-based substring search +// --------------------------------------------------------------------------- + +#[divan::bench] +fn jetscii_decompress(bencher: Bencher) { + let fsst = make_fsst_urls(N); + let finder = 
jetscii::ByteSubstring::new(NEEDLE);
    bencher.bench_local(|| {
        let mut out = Vec::with_capacity(N);
        let decompressor = fsst.decompressor();
        fsst.codes().with_iterator(|iter| {
            out.extend(iter.map(|codes| match codes {
                Some(c) => {
                    let decompressed = decompressor.decompress(c);
                    finder.find(&decompressed).is_some()
                }
                None => false,
            }));
        });
        out
    });
}

#[divan::bench]
fn jetscii_on_raw_bytes(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let canonical = fsst.to_canonical().unwrap().into_varbinview();
    let finder = jetscii::ByteSubstring::new(NEEDLE);
    bencher.bench_local(|| {
        let mut out = Vec::with_capacity(N);
        canonical.with_iterator(|iter| {
            out.extend(iter.map(|s| match s {
                Some(bytes) => finder.find(bytes).is_some(),
                None => false,
            }));
        });
        out
    });
}

// ---------------------------------------------------------------------------
// daachorse benchmarks — double-array Aho-Corasick
// ---------------------------------------------------------------------------

#[divan::bench]
fn daachorse_decompress(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    // NOTE(review): the turbofish value type was garbled in the patch text
    // (`DoubleArrayAhoCorasick::::new`); `u32` is assumed here — verify.
    let ac = DoubleArrayAhoCorasick::<u32>::new([NEEDLE]).unwrap();
    bencher.bench_local(|| {
        let mut out = Vec::with_capacity(N);
        let decompressor = fsst.decompressor();
        fsst.codes().with_iterator(|iter| {
            out.extend(iter.map(|codes| match codes {
                Some(c) => {
                    let decompressed = decompressor.decompress(c);
                    ac.find_iter(&decompressed).next().is_some()
                }
                None => false,
            }));
        });
        out
    });
}

#[divan::bench]
fn daachorse_on_raw_bytes(bencher: Bencher) {
    let fsst = make_fsst_urls(N);
    let canonical = fsst.to_canonical().unwrap().into_varbinview();
    // NOTE(review): value type reconstructed as `u32` — verify (see above).
    let ac = DoubleArrayAhoCorasick::<u32>::new([NEEDLE]).unwrap();
    bencher.bench_local(|| {
        let mut out = Vec::with_capacity(N);
        canonical.with_iterator(|iter| {
            out.extend(iter.map(|s| match s {
                Some(bytes) => ac.find_iter(bytes).next().is_some(),
                None => false,
            }));
        });
        out
    });
}

// ---------------------------------------------------------------------------
// Hybrid DFA benchmarks
// ---------------------------------------------------------------------------

data_bench!(
    prefilter_shift_urls,
    make_fsst_urls,
    NEEDLE,
    PrefilterShiftDfa,
    matches
);
data_bench!(
    prefilter_shift_rare,
    make_fsst_rare_match,
    RARE_NEEDLE,
    PrefilterShiftDfa,
    matches
);
data_bench!(
    state_zero_shift_urls,
    make_fsst_urls,
    NEEDLE,
    StateZeroShiftDfa,
    matches
);
data_bench!(
    state_zero_shift_rare,
    make_fsst_rare_match,
    RARE_NEEDLE,
    StateZeroShiftDfa,
    matches
);

#[divan::bench]
fn cb_prefilter_shift(bencher: Bencher) {
    let fsst = make_fsst_clickbench_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = PrefilterShiftDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        CB_NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            dfa.matches(&prep.all_bytes[start..end])
        })
    });
}

#[divan::bench]
fn cb_state_zero_shift(bencher: Bencher) {
    let fsst = make_fsst_clickbench_urls(N);
    let prep = PreparedArray::from_fsst(&fsst);
    let dfa = StateZeroShiftDfa::new(
        fsst.symbols().as_slice(),
        fsst.symbol_lengths().as_slice(),
        CB_NEEDLE,
    );
    bencher.bench_local(|| {
        BitBufferMut::collect_bool(prep.n, |i| {
            let start = prep.offsets[i];
            let end = prep.offsets[i + 1];
            dfa.matches(&prep.all_bytes[start..end])
        })
    });
}

// ---------------------------------------------------------------------------
// Decompress-only benchmarks (no search) — measures the raw cost of FSST
// decompression for each dataset. Compare against DFA search on compressed
// codes to see the speedup from avoiding decompression entirely.
// ---------------------------------------------------------------------------

/// Decompress all strings without searching.
Measures pure decompression cost. +#[inline(never)] +fn run_decompress_only( + prep: &PreparedArray, + symbols: &[Symbol], + symbol_lengths: &[u8], + buf: &mut Vec, +) { + for i in 0..prep.n { + let start = prep.offsets[i]; + let end = prep.offsets[i + 1]; + decompress_into(&prep.all_bytes[start..end], symbols, symbol_lengths, buf); + // Force the compiler not to optimize away the decompression. + std::hint::black_box(buf.len()); + } +} + +macro_rules! decompress_only_bench { + ($name:ident, $make_fn:ident, $bufsz:expr) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = $make_fn(N); + let prep = PreparedArray::from_fsst(&fsst); + let symbols = fsst.symbols(); + let symbol_lengths = fsst.symbol_lengths(); + let mut buf = Vec::with_capacity($bufsz); + bencher.bench_local(|| { + run_decompress_only( + &prep, + symbols.as_slice(), + symbol_lengths.as_slice(), + &mut buf, + ); + }); + } + }; +} + +decompress_only_bench!(urls_decompress_only, make_fsst_urls, 256); +decompress_only_bench!(cb_decompress_only, make_fsst_clickbench_urls, 512); +decompress_only_bench!(log_decompress_only, make_fsst_log_lines, 256); +decompress_only_bench!(json_decompress_only, make_fsst_json_strings, 256); +decompress_only_bench!(path_decompress_only, make_fsst_file_paths, 256); +decompress_only_bench!(email_decompress_only, make_fsst_emails, 64); +decompress_only_bench!(rare_decompress_only, make_fsst_rare_match, 128); + +// --------------------------------------------------------------------------- +// Vortex array LIKE kernel benchmarks — end-to-end through the full vortex +// execution framework. This measures the production code path including +// array construction, kernel dispatch, and result materialization. 
+// --------------------------------------------------------------------------- + +use std::sync::LazyLock; + +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::session::ArraySession; +use vortex_session::VortexSession; + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +macro_rules! vortex_like_bench { + ($name:ident, $make_fn:ident, $pattern:expr) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = $make_fn(N); + let len = fsst.len(); + let arr = fsst.into_array(); + let pattern = ConstantArray::new($pattern, len).into_array(); + bencher.bench_local(|| { + Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()]) + .unwrap() + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + }); + } + }; +} + +vortex_like_bench!(vortex_like_urls, make_fsst_urls, "%google%"); +vortex_like_bench!(vortex_like_cb, make_fsst_clickbench_urls, "%yandex%"); +vortex_like_bench!(vortex_like_log, make_fsst_log_lines, "%Googlebot%"); +vortex_like_bench!(vortex_like_json, make_fsst_json_strings, "%enterprise%"); +vortex_like_bench!(vortex_like_path, make_fsst_file_paths, "%target/release%"); +vortex_like_bench!(vortex_like_email, make_fsst_emails, "%gmail%"); +vortex_like_bench!(vortex_like_rare, make_fsst_rare_match, "%xyzzy%"); + +// Arrow LIKE benchmarks: decompress FSST → canonical, then run Arrow's LIKE +// (which uses memchr::memmem for %needle% patterns). +macro_rules! 
arrow_like_bench { + ($name:ident, $make_fn:ident, $pattern:expr) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = $make_fn(N); + let len = fsst.len(); + // Pre-decompress to canonical (VarBinViewArray) + let canonical = fsst + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + .into_array(); + let pattern = ConstantArray::new($pattern, len).into_array(); + bencher.bench_local(|| { + Like.try_new_array( + len, + LikeOptions::default(), + [canonical.clone(), pattern.clone()], + ) + .unwrap() + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + }); + } + }; +} + +arrow_like_bench!(arrow_like_urls, make_fsst_urls, "%google%"); +arrow_like_bench!(arrow_like_cb, make_fsst_clickbench_urls, "%yandex%"); +arrow_like_bench!(arrow_like_log, make_fsst_log_lines, "%Googlebot%"); +arrow_like_bench!(arrow_like_json, make_fsst_json_strings, "%enterprise%"); +arrow_like_bench!(arrow_like_rare, make_fsst_rare_match, "%xyzzy%"); + +// End-to-end: decompress + arrow LIKE (measures total cost including decompression) +macro_rules! 
e2e_arrow_like_bench { + ($name:ident, $make_fn:ident, $pattern:expr) => { + #[divan::bench] + fn $name(bencher: Bencher) { + let fsst = $make_fn(N); + let len = fsst.len(); + let arr = fsst.into_array(); + let pattern = ConstantArray::new($pattern, len).into_array(); + bencher.bench_local(|| { + // Decompress inside the timed section + let canonical = arr + .clone() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + .into_array(); + Like.try_new_array(len, LikeOptions::default(), [canonical, pattern.clone()]) + .unwrap() + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + }); + } + }; +} + +e2e_arrow_like_bench!(e2e_arrow_urls, make_fsst_urls, "%google%"); +e2e_arrow_like_bench!(e2e_arrow_cb, make_fsst_clickbench_urls, "%yandex%"); +e2e_arrow_like_bench!(e2e_arrow_log, make_fsst_log_lines, "%Googlebot%"); +e2e_arrow_like_bench!(e2e_arrow_json, make_fsst_json_strings, "%enterprise%"); +e2e_arrow_like_bench!(e2e_arrow_rare, make_fsst_rare_match, "%xyzzy%"); From 9273f56fa46ef4fb95f9ca2431bb53dc3f410d9f Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Fri, 13 Mar 2026 11:42:14 +0000 Subject: [PATCH 2/2] chore[fsst]: more *shared* benchmarks Signed-off-by: Joe Isaacs --- Cargo.lock | 17 - encodings/fsst/Cargo.toml | 8 +- encodings/fsst/benches/fsst_contains.rs | 3706 +------------------- encodings/fsst/benches/fsst_url_compare.rs | 80 +- encodings/fsst/src/test_utils.rs | 525 +++ 5 files changed, 617 insertions(+), 3719 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c728c958ce9..38a21cae5d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1769,12 +1769,6 @@ dependencies = [ "syn 2.0.117", ] -[[package]] -name = "daachorse" -version = "1.0.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63b7ef7a4be509357f4804d0a22e830daddb48f19fd604e4ad32ddce04a94c36" - [[package]] name = "darling" version = "0.23.0" @@ -4737,12 +4731,6 @@ dependencies = [ "glob", ] -[[package]] -name = "jetscii" -version = 
"0.5.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "47f142fe24a9c9944451e8349de0a56af5f3e7226dc46f3ed4d4ecc0b85af75e" - [[package]] name = "jiff" version = "0.2.23" @@ -10157,15 +10145,10 @@ dependencies = [ name = "vortex-fsst" version = "0.1.0" dependencies = [ - "aho-corasick", "codspeed-divan-compat", - "daachorse", "fsst-rs", - "jetscii", - "memchr", "prost 0.14.3", "rand 0.9.2", - "regex-automata", "rstest", "vortex-array", "vortex-buffer", diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 0bcc16b22c9..eb08bbda959 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -30,27 +30,25 @@ vortex-session = { workspace = true } _test-harness = ["dep:rand", "vortex-array/_test-harness"] [dev-dependencies] -aho-corasick = { workspace = true } -daachorse = { workspace = true } divan = { workspace = true } -jetscii = { workspace = true } -memchr = { workspace = true } rand = { workspace = true } -regex-automata = { workspace = true } rstest = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } [[bench]] name = "fsst_compress" harness = false +required-features = ["_test-harness"] [[bench]] name = "fsst_contains" harness = false +required-features = ["_test-harness"] [[bench]] name = "fsst_url_compare" harness = false +required-features = ["_test-harness"] [[bench]] name = "chunked_dict_fsst_builder" diff --git a/encodings/fsst/benches/fsst_contains.rs b/encodings/fsst/benches/fsst_contains.rs index 187be73cd5b..6885ad0543e 100644 --- a/encodings/fsst/benches/fsst_contains.rs +++ b/encodings/fsst/benches/fsst_contains.rs @@ -1,3546 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -#![allow( - clippy::unwrap_used, - clippy::cast_possible_truncation, - clippy::missing_safety_doc -)] - -use aho_corasick::AhoCorasick; -use daachorse::DoubleArrayAhoCorasick; -use divan::Bencher; -use fsst::ESCAPE_CODE; 
-use fsst::Symbol; -use memchr::memmem; -use rand::Rng; -use rand::SeedableRng; -use rand::rngs::StdRng; -use regex_automata::dfa::regex::Regex as DfaRegex; -use vortex_array::ToCanonical; -use vortex_array::accessor::ArrayAccessor; -use vortex_array::arrays::VarBinArray; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; -use vortex_array::match_each_integer_ptype; -use vortex_buffer::BitBufferMut; -use vortex_fsst::FSSTArray; -use vortex_fsst::fsst_compress; -use vortex_fsst::fsst_train_compressor; - -fn main() { - divan::main(); -} - -// --------------------------------------------------------------------------- -// URL generator -// --------------------------------------------------------------------------- - -const DOMAINS: &[&str] = &[ - "google.com", - "facebook.com", - "github.com", - "stackoverflow.com", - "amazon.com", - "reddit.com", - "twitter.com", - "youtube.com", - "wikipedia.org", - "microsoft.com", - "apple.com", - "netflix.com", - "linkedin.com", - "cloudflare.com", - "google.co.uk", - "docs.google.com", - "mail.google.com", - "maps.google.com", - "news.ycombinator.com", - "arxiv.org", -]; - -const PATHS: &[&str] = &[ - "/index.html", - "/about", - "/search?q=vortex", - "/user/profile/settings", - "/api/v2/data", - "/blog/2024/post", - "/products/item/12345", - "/docs/reference/guide", - "/login", - "/dashboard/analytics", -]; - -fn generate_urls(n: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(42); - (0..n) - .map(|_| { - let scheme = if rng.random_bool(0.8) { - "https" - } else { - "http" - }; - let domain = DOMAINS[rng.random_range(0..DOMAINS.len())]; - let path = PATHS[rng.random_range(0..PATHS.len())]; - format!("{scheme}://{domain}{path}") - }) - .collect() -} - -fn make_fsst_urls(n: usize) -> FSSTArray { - let urls = generate_urls(n); - let varbin = VarBinArray::from_iter( - urls.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - 
fsst_compress(varbin, &compressor) -} - -// --------------------------------------------------------------------------- -// KMP helpers -// --------------------------------------------------------------------------- - -fn kmp_failure_table(needle: &[u8]) -> Vec { - let mut failure = vec![0usize; needle.len()]; - let mut k = 0; - for i in 1..needle.len() { - while k > 0 && needle[k] != needle[i] { - k = failure[k - 1]; - } - if needle[k] == needle[i] { - k += 1; - } - failure[i] = k; - } - failure -} - -fn kmp_byte_transitions(needle: &[u8]) -> Vec { - let n_states = needle.len() + 1; - let accept = needle.len() as u16; - let failure = kmp_failure_table(needle); - - let mut table = vec![0u16; n_states * 256]; - for state in 0..n_states { - for byte in 0..256u16 { - if state == needle.len() { - table[state * 256 + byte as usize] = accept; - continue; - } - let mut s = state; - loop { - if byte as u8 == needle[s] { - s += 1; - break; - } - if s == 0 { - break; - } - s = failure[s - 1]; - } - table[state * 256 + byte as usize] = s as u16; - } - } - table -} - -// --------------------------------------------------------------------------- -// Approach 1: Original split-table DFA (baseline from production code) -// --------------------------------------------------------------------------- - -struct SplitTableDfa { - symbol_transitions: Vec, - escape_transitions: Vec, - n_symbols: usize, - accept_state: u16, -} - -impl SplitTableDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let n_symbols = symbols.len(); - let accept_state = needle.len() as u16; - let n_states = needle.len() + 1; - - let byte_table = kmp_byte_transitions(needle); - - let mut symbol_transitions = vec![0u16; n_states * n_symbols]; - for state in 0..n_states { - for code in 0..n_symbols { - if state as u16 == accept_state { - symbol_transitions[state * n_symbols + code] = accept_state; - continue; - } - let sym = symbols[code].to_u64().to_le_bytes(); - let sym_len = 
symbol_lengths[code] as usize; - let mut s = state as u16; - for &b in &sym[..sym_len] { - if s == accept_state { - break; - } - s = byte_table[s as usize * 256 + b as usize]; - } - symbol_transitions[state * n_symbols + code] = s; - } - } - - Self { - symbol_transitions, - escape_transitions: byte_table, - n_symbols, - accept_state, - } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - let mut state = 0u16; - let mut pos = 0; - while pos < codes.len() { - if state == self.accept_state { - return true; - } - let code = codes[pos]; - pos += 1; - if code == ESCAPE_CODE { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = self.symbol_transitions[state as usize * self.n_symbols + code as usize]; - } - } - state == self.accept_state - } -} - -// --------------------------------------------------------------------------- -// Approach 2: Fused 256-entry table (unified lookup, sentinel for escapes) -// --------------------------------------------------------------------------- - -struct FusedTableDfa { - transitions: Vec, - escape_transitions: Vec, - accept_state: u16, - escape_sentinel: u16, -} - -impl FusedTableDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let n_symbols = symbols.len(); - let accept_state = needle.len() as u16; - let n_states = needle.len() + 1; - let escape_sentinel = n_states as u16 + 1; - - let byte_table = kmp_byte_transitions(needle); - - let mut symbol_transitions = vec![0u16; n_states * n_symbols]; - for state in 0..n_states { - for code in 0..n_symbols { - if state as u16 == accept_state { - symbol_transitions[state * n_symbols + code] = accept_state; - continue; - } - let sym = symbols[code].to_u64().to_le_bytes(); - let sym_len = symbol_lengths[code] as usize; - let mut s = state as u16; - for &b in &sym[..sym_len] { - if s == accept_state { - break; - } - s = byte_table[s 
as usize * 256 + b as usize]; - } - symbol_transitions[state * n_symbols + code] = s; - } - } - - let mut transitions = vec![0u16; n_states * 256]; - for state in 0..n_states { - for code in 0..n_symbols { - transitions[state * 256 + code] = symbol_transitions[state * n_symbols + code]; - } - transitions[state * 256 + ESCAPE_CODE as usize] = escape_sentinel; - } - - Self { - transitions, - escape_transitions: byte_table, - accept_state, - escape_sentinel, - } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - let mut state = 0u16; - let mut pos = 0; - while pos < codes.len() { - if state == self.accept_state { - return true; - } - let code = codes[pos]; - pos += 1; - let next = self.transitions[state as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = next; - } - } - state == self.accept_state - } - - /// No early exit — skip the accept_state check inside the loop. - /// Only check at the end. The accept state is sticky (transitions to itself), - /// so final state == accept means we matched at some point. - #[inline] - fn matches_no_early_exit(&self, codes: &[u8]) -> bool { - let mut state = 0u16; - let mut pos = 0; - while pos < codes.len() { - let code = codes[pos]; - pos += 1; - let next = self.transitions[state as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = next; - } - } - state == self.accept_state - } - - /// Unsafe variant — eliminates bounds checks on table lookups. 
- #[inline] - unsafe fn matches_unchecked(&self, codes: &[u8]) -> bool { - unsafe { - let mut state = 0u16; - let mut pos = 0; - let transitions = self.transitions.as_ptr(); - let escape_transitions = self.escape_transitions.as_ptr(); - let len = codes.len(); - let codes_ptr = codes.as_ptr(); - - while pos < len { - if state == self.accept_state { - return true; - } - let code = *codes_ptr.add(pos); - pos += 1; - let next = *transitions.add(state as usize * 256 + code as usize); - if next == self.escape_sentinel { - if pos >= len { - return false; - } - let b = *codes_ptr.add(pos); - pos += 1; - state = *escape_transitions.add(state as usize * 256 + b as usize); - } else { - state = next; - } - } - state == self.accept_state - } - } - - /// No early exit + unsafe bounds elimination. - #[inline] - unsafe fn matches_no_exit_unchecked(&self, codes: &[u8]) -> bool { - unsafe { - let mut state = 0u16; - let mut pos = 0; - let transitions = self.transitions.as_ptr(); - let escape_transitions = self.escape_transitions.as_ptr(); - let len = codes.len(); - let codes_ptr = codes.as_ptr(); - - while pos < len { - let code = *codes_ptr.add(pos); - pos += 1; - let next = *transitions.add(state as usize * 256 + code as usize); - if next == self.escape_sentinel { - if pos >= len { - return false; - } - let b = *codes_ptr.add(pos); - pos += 1; - state = *escape_transitions.add(state as usize * 256 + b as usize); - } else { - state = next; - } - } - state == self.accept_state - } - } -} - -// --------------------------------------------------------------------------- -// Approach 3: Fused u32 table for SIMD gather (process 8 strings at once) -// --------------------------------------------------------------------------- - -#[cfg(target_arch = "x86_64")] -struct SimdGatherDfa { - /// u32 transition table, 256 entries per state. - transitions: Vec, - /// u32 escape transition table, 256 entries per state. 
- escape_transitions: Vec, - accept_state: u32, - escape_sentinel: u32, -} - -#[cfg(target_arch = "x86_64")] -impl SimdGatherDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); - - Self { - transitions: fused.transitions.iter().map(|&v| v as u32).collect(), - escape_transitions: fused.escape_transitions.iter().map(|&v| v as u32).collect(), - accept_state: fused.accept_state as u32, - escape_sentinel: fused.escape_sentinel as u32, - } - } - - /// Scalar fallback using the u32 tables. - #[inline] - fn matches_scalar(&self, codes: &[u8]) -> bool { - let mut state = 0u32; - let mut pos = 0; - while pos < codes.len() { - if state == self.accept_state { - return true; - } - let code = codes[pos]; - pos += 1; - let next = self.transitions[state as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = next; - } - } - state == self.accept_state - } - - /// Process 8 strings simultaneously using AVX2 gather for transition lookups. - /// - /// Each iteration loads one code byte from each of 8 strings, computes - /// table indices, and uses VPGATHERDD to fetch 8 transitions at once. 
- #[cfg(target_feature = "avx2")] - #[inline] - unsafe fn matches_8_avx2( - &self, - all_bytes: &[u8], - starts: &[usize; 8], - ends: &[usize; 8], - ) -> [bool; 8] { - unsafe { - let transitions_ptr = self.transitions.as_ptr() as *const i32; - let escape_ptr = self.escape_transitions.as_ptr() as *const i32; - let bytes_ptr = all_bytes.as_ptr(); - let accept = self.accept_state; - let sentinel = self.escape_sentinel; - - let mut states = [0u32; 8]; - let mut pos: [usize; 8] = *starts; - let mut done = [false; 8]; - - loop { - let mut any_active = false; - - for k in 0..8 { - if done[k] { - continue; - } - if pos[k] >= ends[k] { - done[k] = true; - continue; - } - any_active = true; - - let code = *bytes_ptr.add(pos[k]); - pos[k] += 1; - let next = - *transitions_ptr.add(states[k] as usize * 256 + code as usize) as u32; - if next == sentinel { - if pos[k] >= ends[k] { - done[k] = true; - continue; - } - let b = *bytes_ptr.add(pos[k]); - pos[k] += 1; - states[k] = *escape_ptr.add(states[k] as usize * 256 + b as usize) as u32; - } else { - states[k] = next; - } - if states[k] == accept { - done[k] = true; - } - } - if !any_active { - break; - } - } - - std::array::from_fn(|k| states[k] == accept) - } - } -} - -// --------------------------------------------------------------------------- -// Approach 4: Branchless escape handling via combined table -// Instead of branching on escape sentinel, use a "code_advance" table that -// tells how many bytes to consume (1 for normal, 2 for escape), and a -// combined table that gives the right state for both cases. -// --------------------------------------------------------------------------- - -struct BranchlessEscapeDfa { - /// For each (state, first_byte, second_byte) triple, the next state. - /// But 256*256 per state is too large. Instead: - /// For non-escape codes: transitions[state * 256 + code] gives next state. 
- /// For escape code: transitions[state * 256 + 255] is unused; we use - /// escape_transitions[state * 256 + literal_byte]. - /// - /// The branchless trick: always read the next byte (speculatively). - /// Use a conditional move to select between the normal and escape path. - transitions: Vec, - escape_transitions: Vec, - /// 1 for normal codes, 2 for ESCAPE_CODE. - code_advance: [u8; 256], - accept_state: u16, -} - -impl BranchlessEscapeDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); - - let mut code_advance = [1u8; 256]; - code_advance[ESCAPE_CODE as usize] = 2; - - Self { - transitions: fused.transitions, - escape_transitions: fused.escape_transitions, - code_advance, - accept_state: fused.accept_state, - } - } - - /// Branchless escape handling: speculatively read the next byte and - /// select between normal and escape transitions using conditional ops. - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - if codes.is_empty() { - return self.accept_state == 0; - } - let mut state = 0u16; - let mut pos = 0; - let len = codes.len(); - - while pos < len { - let code = codes[pos]; - let advance = self.code_advance[code as usize] as usize; - - // Speculatively read the next byte (needed for escapes). - // For non-escape codes this read is wasted but harmless. - let next_byte = if pos + 1 < len { codes[pos + 1] } else { 0 }; - - let normal_next = self.transitions[state as usize * 256 + code as usize]; - let escape_next = self.escape_transitions[state as usize * 256 + next_byte as usize]; - - // Select: if this is an escape code, use escape_next; otherwise normal_next. 
- let is_escape = code == ESCAPE_CODE; - state = if is_escape { escape_next } else { normal_next }; - - pos += advance; - - if state == self.accept_state { - return true; - } - } - state == self.accept_state - } -} - -// --------------------------------------------------------------------------- -// Approach 5: u8 state table — halve table size (u16→u8) since states fit in -// a byte. Smaller tables = better cache utilization. -// --------------------------------------------------------------------------- - -struct CompactDfa { - /// u8 transitions, 256 entries per state. - transitions: Vec, - escape_transitions: Vec, - accept_state: u8, - escape_sentinel: u8, -} - -impl CompactDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); - Self { - transitions: fused.transitions.iter().map(|&v| v as u8).collect(), - escape_transitions: fused.escape_transitions.iter().map(|&v| v as u8).collect(), - accept_state: fused.accept_state as u8, - escape_sentinel: fused.escape_sentinel as u8, - } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - let mut state = 0u8; - let mut pos = 0; - while pos < codes.len() { - if state == self.accept_state { - return true; - } - let code = codes[pos]; - pos += 1; - let next = self.transitions[state as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = next; - } - } - state == self.accept_state - } - - #[inline] - fn matches_no_early_exit(&self, codes: &[u8]) -> bool { - let mut state = 0u8; - let mut pos = 0; - while pos < codes.len() { - let code = codes[pos]; - pos += 1; - let next = self.transitions[state as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 
1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = next; - } - } - state == self.accept_state - } - - /// Unsafe no-exit variant. - #[inline] - unsafe fn matches_no_exit_unchecked(&self, codes: &[u8]) -> bool { - unsafe { - let mut state = 0u8; - let mut pos = 0; - let transitions = self.transitions.as_ptr(); - let escape_transitions = self.escape_transitions.as_ptr(); - let len = codes.len(); - let codes_ptr = codes.as_ptr(); - - while pos < len { - let code = *codes_ptr.add(pos); - pos += 1; - let next = *transitions.add(state as usize * 256 + code as usize); - if next == self.escape_sentinel { - if pos >= len { - return false; - } - let b = *codes_ptr.add(pos); - pos += 1; - state = *escape_transitions.add(state as usize * 256 + b as usize); - } else { - state = next; - } - } - state == self.accept_state - } - } -} - -// --------------------------------------------------------------------------- -// Approach 6: Streaming scan — process the ENTIRE codes buffer in one pass, -// resetting state at string boundaries. Avoids per-string slice overhead -// and is friendlier to the hardware prefetcher. -// --------------------------------------------------------------------------- - -#[inline(never)] -#[allow(dead_code)] -fn streaming_scan_fused( - dfa: &FusedTableDfa, - all_bytes: &[u8], - offsets: &[usize], - n: usize, -) -> BitBufferMut { - BitBufferMut::collect_bool(n, |i| { - // The collect_bool closure is called sequentially for i=0..n. - // We rely on the sequential access pattern being prefetch-friendly. - let start = offsets[i]; - let end = offsets[i + 1]; - dfa.matches(&all_bytes[start..end]) - }) -} - -/// True streaming: single pass through all_bytes with offset-based reset. 
-#[inline(never)] -fn streaming_scan_continuous( - dfa: &CompactDfa, - all_bytes: &[u8], - offsets: &[usize], - n: usize, - out: &mut BitBufferMut, -) { - let mut string_idx = 0; - let mut state = 0u8; - let mut next_boundary = offsets[1]; - let mut matched = false; - - let mut pos = offsets[0]; - let total_end = offsets[n]; - - while pos < total_end { - // Check if we've crossed into a new string. - while pos >= next_boundary { - // Record result for the just-finished string. - if matched || state == dfa.accept_state { - out.set(string_idx); - } - string_idx += 1; - if string_idx >= n { - return; - } - state = 0; - matched = false; - next_boundary = offsets[string_idx + 1]; - } - - let code = all_bytes[pos]; - pos += 1; - let next = dfa.transitions[state as usize * 256 + code as usize]; - if next == dfa.escape_sentinel { - if pos < next_boundary { - let b = all_bytes[pos]; - pos += 1; - state = dfa.escape_transitions[state as usize * 256 + b as usize]; - } - } else { - state = next; - } - if state == dfa.accept_state { - matched = true; - } - } - - // Handle the last string. - if string_idx < n && (matched || state == dfa.accept_state) { - out.set(string_idx); - } -} - -// --------------------------------------------------------------------------- -// Approach 7: Prefilter — build a bitmask of codes that could possibly -// contribute to matching the needle. Skip DFA for strings where no code -// belongs to that set. -// --------------------------------------------------------------------------- - -struct PrefilterDfa { - inner: CompactDfa, - /// For each code byte (0..255), true if that code could produce any byte - /// present in the needle (i.e., the symbol's bytes intersect needle's bytes). - relevant_codes: [bool; 256], -} - -impl PrefilterDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let inner = CompactDfa::new(symbols, symbol_lengths, needle); - - // Build set of bytes that appear in the needle. 
- let mut needle_bytes = [false; 256]; - for &b in needle { - needle_bytes[b as usize] = true; - } - - // For each symbol code, check if any of its bytes appear in the needle. - let mut relevant_codes = [false; 256]; - for (code, (sym, &sym_len)) in symbols.iter().zip(symbol_lengths.iter()).enumerate() { - let sym_bytes = sym.to_u64().to_le_bytes(); - for &b in &sym_bytes[..sym_len as usize] { - if needle_bytes[b as usize] { - relevant_codes[code] = true; - break; - } - } - } - // Escape code is always relevant (literal bytes could be anything). - relevant_codes[ESCAPE_CODE as usize] = true; - - Self { - inner, - relevant_codes, - } - } - - /// Quick check: does this code sequence contain any code that could - /// contribute to the needle match? - #[inline] - fn could_match(&self, codes: &[u8]) -> bool { - codes.iter().any(|&c| self.relevant_codes[c as usize]) - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - if !self.could_match(codes) { - return false; - } - self.inner.matches(codes) - } - - #[inline] - fn matches_no_early_exit(&self, codes: &[u8]) -> bool { - if !self.could_match(codes) { - return false; - } - self.inner.matches_no_early_exit(codes) - } -} - -// --------------------------------------------------------------------------- -// Approach 8: State-zero skip DFA — skip runs of codes that keep state=0. -// -// Precompute a 256-byte lookup: for each code byte, does transitioning from -// state 0 stay in state 0? If so, that code is "trivial" and can be skipped. -// Process codes in chunks: scan for the first non-trivial code, then run -// the scalar DFA from there. This is most effective when the needle is rare -// (most codes are trivial), which is the common case for selective predicates. -// --------------------------------------------------------------------------- - -struct StateZeroSkipDfa { - inner: CompactDfa, - /// For each code byte (0..255), true if it keeps state 0 → state 0. 
- trivial: [bool; 256], -} - -impl StateZeroSkipDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let inner = CompactDfa::new(symbols, symbol_lengths, needle); - - let mut trivial = [false; 256]; - for code in 0..256 { - // A code is trivial if from state 0 it goes back to state 0 - // and it's not the escape sentinel. - let next = inner.transitions[code]; // state 0 * 256 + code - trivial[code] = next == 0 && code as u8 != ESCAPE_CODE; - } - - Self { inner, trivial } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - // Skip leading trivial codes. - let mut start = 0; - while start < codes.len() && self.trivial[codes[start] as usize] { - start += 1; - } - if start == codes.len() { - return self.inner.accept_state == 0; - } - // Run the DFA from the first non-trivial code. - self.inner.matches_no_early_exit(&codes[start..]) - } -} - -// --------------------------------------------------------------------------- -// Approach 9: Shift-based DFA — pack all state transitions into a u64. -// -// For a DFA with S ≤ 21 states (3 bits each fit in 63 bits of a u64), -// we store the transitions for ALL states for a given input byte in one u64. -// Transition: next_state = (table[code_byte] >> (state * BITS)) & MASK -// -// The key advantage: the table load depends only on code_byte (known from -// the input stream), NOT on the current state. This breaks the load-use -// dependency chain that makes traditional table-lookup DFAs slow (~4 cycle -// L1 latency per transition). With the shift-based approach, the table -// value can be loaded while the previous transition's shift is executing. -// --------------------------------------------------------------------------- - -struct ShiftDfa { - /// For each code byte (0..255): a u64 packing all state transitions. - /// Bits [state*3 .. state*3+3) encode the next state for that input. - transitions: [u64; 256], - /// Same layout for escape byte transitions. 
- escape_transitions: [u64; 256], - accept_state: u8, - escape_sentinel: u8, -} - -impl ShiftDfa { - const BITS: u32 = 4; // bits per state (supports up to 16 states = 2^4) - const MASK: u64 = (1 << Self::BITS) - 1; - - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - assert!( - needle.len() + 2 <= (1 << Self::BITS), - "needle too long for 4-bit states (max 14 chars)" - ); - - let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); - - // Pack the fused u16 transitions into u64 shift tables. - let n_states = needle.len() + 1; - let escape_sentinel_u8 = fused.escape_sentinel as u8; - - let mut transitions = [0u64; 256]; - let mut escape_transitions = [0u64; 256]; - - for code_byte in 0..256usize { - let mut packed = 0u64; - for state in 0..n_states { - let next = fused.transitions[state * 256 + code_byte]; - // Map the escape sentinel to a value that fits in 3 bits. - let next_u8 = if next == fused.escape_sentinel { - escape_sentinel_u8 - } else { - next as u8 - }; - packed |= (next_u8 as u64) << (state as u32 * Self::BITS); - } - transitions[code_byte] = packed; - } - - for byte_val in 0..256usize { - let mut packed = 0u64; - for state in 0..n_states { - let next = fused.escape_transitions[state * 256 + byte_val] as u8; - packed |= (next as u64) << (state as u32 * Self::BITS); - } - escape_transitions[byte_val] = packed; - } - - Self { - transitions, - escape_transitions, - accept_state: fused.accept_state as u8, - escape_sentinel: escape_sentinel_u8, - } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - let mut state = 0u8; - let mut pos = 0; - while pos < codes.len() { - if state == self.accept_state { - return true; - } - let code = codes[pos]; - pos += 1; - // The table load depends only on `code`, not on `state`. - // The shift depends on `state` but is a fast register op. 
- let packed = self.transitions[code as usize]; - let next = ((packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - let esc_packed = self.escape_transitions[b as usize]; - state = ((esc_packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; - } else { - state = next; - } - } - state == self.accept_state - } - - #[inline] - fn matches_no_early_exit(&self, codes: &[u8]) -> bool { - let mut state = 0u8; - let mut pos = 0; - while pos < codes.len() { - let code = codes[pos]; - pos += 1; - let packed = self.transitions[code as usize]; - let next = ((packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; - if next == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - let esc_packed = self.escape_transitions[b as usize]; - state = ((esc_packed >> (state as u32 * Self::BITS)) & Self::MASK) as u8; - } else { - state = next; - } - } - state == self.accept_state - } -} - -// --------------------------------------------------------------------------- -// Hybrid 1: Prefilter + ShiftDfa — skip strings with no relevant codes, -// then use the fastest DFA (ShiftDfa) for survivors. 
-// --------------------------------------------------------------------------- - -struct PrefilterShiftDfa { - inner: ShiftDfa, - relevant_codes: [bool; 256], -} - -impl PrefilterShiftDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let inner = ShiftDfa::new(symbols, symbol_lengths, needle); - - let mut needle_bytes = [false; 256]; - for &b in needle { - needle_bytes[b as usize] = true; - } - - let mut relevant_codes = [false; 256]; - for (code, (sym, &sym_len)) in symbols.iter().zip(symbol_lengths.iter()).enumerate() { - let sym_bytes = sym.to_u64().to_le_bytes(); - for &b in &sym_bytes[..sym_len as usize] { - if needle_bytes[b as usize] { - relevant_codes[code] = true; - break; - } - } - } - relevant_codes[ESCAPE_CODE as usize] = true; - - Self { - inner, - relevant_codes, - } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - if !codes.iter().any(|&c| self.relevant_codes[c as usize]) { - return false; - } - self.inner.matches_no_early_exit(codes) - } -} - -// --------------------------------------------------------------------------- -// Hybrid 2: StateZero skip + ShiftDfa — skip leading trivial codes, -// then use ShiftDfa for the remainder. 
-// --------------------------------------------------------------------------- - -struct StateZeroShiftDfa { - inner: ShiftDfa, - trivial: [bool; 256], -} - -impl StateZeroShiftDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let inner = ShiftDfa::new(symbols, symbol_lengths, needle); - - let mut trivial = [false; 256]; - for code in 0..256 { - let packed = inner.transitions[code]; - let next = (packed & ShiftDfa::MASK) as u8; - trivial[code] = next == 0 && code as u8 != ESCAPE_CODE; - } - - Self { inner, trivial } - } - - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - let mut start = 0; - while start < codes.len() && self.trivial[codes[start] as usize] { - start += 1; - } - if start == codes.len() { - return self.inner.accept_state == 0; - } - self.inner.matches_no_early_exit(&codes[start..]) - } -} - -// --------------------------------------------------------------------------- -// Approach 9: Sheng DFA — use SSSE3 PSHUFB for transitions. -// -// The state is a byte position in an XMM register. For each input byte, -// we load a 16-byte shuffle mask and do PSHUFB(mask, state_vec). -// PSHUFB uses the low 4 bits of each byte lane as an index into the mask, -// producing the next state. With ≤16 states this is a single instruction. -// -// The shuffle mask load depends only on the input byte (not on state), -// so it can be loaded in parallel with the previous PSHUFB's execution. -// Throughput: ~1 byte/cycle (limited by PSHUFB throughput of 1/cycle on -// most microarchitectures). -// --------------------------------------------------------------------------- - -#[cfg(target_arch = "x86_64")] -struct ShengDfa { - /// 256 shuffle masks, one per possible input byte. - /// Each mask is 16 bytes: mask[i] = next_state when current state == i. - masks: Vec, - /// 256 escape masks for escaped byte values. 
- escape_masks: Vec, - accept_state: u8, - escape_sentinel: u8, -} - -#[cfg(target_arch = "x86_64")] -impl ShengDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - use std::arch::x86_64::_mm_set_epi8; - - let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); - let escape_sentinel = fused.escape_sentinel as u8; - - let mut masks = Vec::with_capacity(256); - let mut escape_masks = Vec::with_capacity(256); - - for code_byte in 0..256usize { - let mut mask_bytes = [0u8; 16]; - for state in 0..16 { - if state < needle.len() + 1 { - let next = fused.transitions[state * 256 + code_byte]; - mask_bytes[state] = if next == fused.escape_sentinel { - escape_sentinel - } else { - next as u8 - }; - } - } - masks.push(unsafe { - _mm_set_epi8( - mask_bytes[15] as i8, - mask_bytes[14] as i8, - mask_bytes[13] as i8, - mask_bytes[12] as i8, - mask_bytes[11] as i8, - mask_bytes[10] as i8, - mask_bytes[9] as i8, - mask_bytes[8] as i8, - mask_bytes[7] as i8, - mask_bytes[6] as i8, - mask_bytes[5] as i8, - mask_bytes[4] as i8, - mask_bytes[3] as i8, - mask_bytes[2] as i8, - mask_bytes[1] as i8, - mask_bytes[0] as i8, - ) - }); - } - - for byte_val in 0..256usize { - let mut mask_bytes = [0u8; 16]; - for state in 0..16 { - if state < needle.len() + 1 { - mask_bytes[state] = fused.escape_transitions[state * 256 + byte_val] as u8; - } - } - escape_masks.push(unsafe { - _mm_set_epi8( - mask_bytes[15] as i8, - mask_bytes[14] as i8, - mask_bytes[13] as i8, - mask_bytes[12] as i8, - mask_bytes[11] as i8, - mask_bytes[10] as i8, - mask_bytes[9] as i8, - mask_bytes[8] as i8, - mask_bytes[7] as i8, - mask_bytes[6] as i8, - mask_bytes[5] as i8, - mask_bytes[4] as i8, - mask_bytes[3] as i8, - mask_bytes[2] as i8, - mask_bytes[1] as i8, - mask_bytes[0] as i8, - ) - }); - } - - Self { - masks, - escape_masks, - accept_state: fused.accept_state as u8, - escape_sentinel, - } - } - - #[inline] - #[target_feature(enable = "ssse3")] - unsafe fn matches(&self, 
codes: &[u8]) -> bool { - use std::arch::x86_64::_mm_extract_epi8; - use std::arch::x86_64::_mm_set1_epi8; - use std::arch::x86_64::_mm_shuffle_epi8; - - unsafe { - let mut state_vec = _mm_set1_epi8(0); - let mut pos = 0; - - while pos < codes.len() { - let cur_state = _mm_extract_epi8::<0>(state_vec) as u8; - if cur_state == self.accept_state { - return true; - } - - let code = codes[pos]; - pos += 1; - - // One PSHUFB: the mask load depends only on `code`, not state. - let next_vec = _mm_shuffle_epi8(self.masks[code as usize], state_vec); - let next_state = _mm_extract_epi8::<0>(next_vec) as u8; - - if next_state == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state_vec = _mm_shuffle_epi8(self.escape_masks[b as usize], state_vec); - } else { - state_vec = next_vec; - } - } - - _mm_extract_epi8::<0>(state_vec) as u8 == self.accept_state - } - } - - #[inline] - #[target_feature(enable = "ssse3")] - unsafe fn matches_no_early_exit(&self, codes: &[u8]) -> bool { - use std::arch::x86_64::_mm_extract_epi8; - use std::arch::x86_64::_mm_set1_epi8; - use std::arch::x86_64::_mm_shuffle_epi8; - - unsafe { - let mut state_vec = _mm_set1_epi8(0); - let mut pos = 0; - - while pos < codes.len() { - let code = codes[pos]; - pos += 1; - - let next_vec = _mm_shuffle_epi8(self.masks[code as usize], state_vec); - let next_state = _mm_extract_epi8::<0>(next_vec) as u8; - - if next_state == self.escape_sentinel { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - state_vec = _mm_shuffle_epi8(self.escape_masks[b as usize], state_vec); - } else { - state_vec = next_vec; - } - } - - _mm_extract_epi8::<0>(state_vec) as u8 == self.accept_state - } - } -} - -// --------------------------------------------------------------------------- -// Approach 10: Speculative/Enumerated DFA — run from ALL start states at once. 
-// -// For a DFA with S states and a code sequence of length L, we process codes -// sequentially but track S states simultaneously. Each "state" in our vector -// is the result of starting from a different initial state. After processing -// the full sequence, we look up the result for initial state 0. -// -// Why is this useful? It enables processing codes in independent chunks: -// each chunk can run in parallel, and results are chained by composing -// the state-to-state mappings. For small S this is very efficient. -// --------------------------------------------------------------------------- - -struct EnumeratedDfa { - /// For each (state, code_byte): next state. 256 entries per state. - transitions: Vec, - escape_transitions: Vec, - n_states: usize, - accept_state: u16, - escape_sentinel: u16, -} - -impl EnumeratedDfa { - fn new(symbols: &[Symbol], symbol_lengths: &[u8], needle: &[u8]) -> Self { - let fused = FusedTableDfa::new(symbols, symbol_lengths, needle); - Self { - transitions: fused.transitions, - escape_transitions: fused.escape_transitions, - n_states: needle.len() + 1, - accept_state: fused.accept_state, - escape_sentinel: fused.escape_sentinel, - } - } - - /// Process a single code sequence by tracking all possible start states. - /// Returns true if starting from state 0 reaches accept. - #[inline] - fn matches(&self, codes: &[u8]) -> bool { - // For each possible start state, track where it ends up. 
- // state_map[s] = "if we started in state s, we'd now be in state state_map[s]" - let ns = self.n_states; - let mut state_map: [u16; 16] = [0; 16]; // supports up to 16 states - for s in 0..ns { - state_map[s] = s as u16; - } - - let mut pos = 0; - while pos < codes.len() { - let code = codes[pos]; - pos += 1; - - let next_fn = self.transitions.as_ptr(); - let esc_fn = self.escape_transitions.as_ptr(); - - if code == ESCAPE_CODE { - if pos >= codes.len() { - return false; - } - let b = codes[pos]; - pos += 1; - for s in 0..ns { - let cur = state_map[s]; - state_map[s] = unsafe { *esc_fn.add(cur as usize * 256 + b as usize) }; - } - } else { - for s in 0..ns { - let cur = state_map[s]; - let next = unsafe { *next_fn.add(cur as usize * 256 + code as usize) }; - state_map[s] = if next == self.escape_sentinel { - // shouldn't happen for non-escape codes - cur - } else { - next - }; - } - } - - // Early exit: if starting from state 0 we've already accepted - if state_map[0] == self.accept_state { - return true; - } - } - - state_map[0] == self.accept_state - } - - /// Chunked parallel version: split codes into chunks, process each chunk - #[allow(dead_code)] - /// to get a state mapping, then compose mappings. - #[inline] - fn matches_chunked(&self, codes: &[u8], chunk_size: usize) -> bool { - if codes.is_empty() { - return self.accept_state == 0; - } - - let ns = self.n_states; - - // Process the full sequence but in chunks, building state maps that - // could theoretically be parallelized. - let mut global_map: [u16; 16] = [0; 16]; - for s in 0..ns { - global_map[s] = s as u16; - } - - // We still process sequentially here but the structure allows future - // parallelization with rayon/SIMD on independent chunks. - let mut pos = 0; - while pos < codes.len() { - let chunk_end = (pos + chunk_size).min(codes.len()); - - // Build mapping for this chunk: for each start state, what's the end state? 
- let mut chunk_map: [u16; 16] = [0; 16]; - for start_state in 0..ns { - let mut state = start_state as u16; - let mut p = pos; - while p < chunk_end { - let code = codes[p]; - p += 1; - let next = self.transitions[state as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if p >= chunk_end { - // Escape spans chunk boundary — just do the lookup - // with byte 0 as placeholder, will be corrected - break; - } - let b = codes[p]; - p += 1; - state = self.escape_transitions[state as usize * 256 + b as usize]; - } else { - state = next; - } - } - chunk_map[start_state] = state; - } - - // Compose: global_map = chunk_map(global_map) - let mut new_global: [u16; 16] = [0; 16]; - for s in 0..ns { - new_global[s] = chunk_map[global_map[s] as usize]; - } - global_map = new_global; - - pos = chunk_end; - } - - global_map[0] == self.accept_state - } -} - -// --------------------------------------------------------------------------- -// Approach 6: Speculative multi-string — process multiple strings, each with -// early-exit SIMD checking across the batch after each code step. -// --------------------------------------------------------------------------- - -impl FusedTableDfa { - /// Process N strings at once. After each code step, check if ALL strings - /// have resolved (accepted or exhausted). Uses u16 states packed for - /// potential SIMD comparison. 
- #[inline] - fn matches_multi_early_exit( - &self, - all_bytes: &[u8], - starts: &[usize; N], - ends: &[usize; N], - ) -> [bool; N] { - let mut states = [0u16; N]; - let mut pos = *starts; - let mut resolved = 0u32; // bitmask of resolved strings - - let all_resolved = (1u32 << N) - 1; - - loop { - if resolved == all_resolved { - break; - } - - let mut any_progress = false; - for k in 0..N { - if resolved & (1 << k) != 0 { - continue; - } - if pos[k] >= ends[k] { - resolved |= 1 << k; - continue; - } - any_progress = true; - - let code = all_bytes[pos[k]]; - pos[k] += 1; - let next = self.transitions[states[k] as usize * 256 + code as usize]; - if next == self.escape_sentinel { - if pos[k] >= ends[k] { - resolved |= 1 << k; - continue; - } - let b = all_bytes[pos[k]]; - pos[k] += 1; - states[k] = self.escape_transitions[states[k] as usize * 256 + b as usize]; - } else { - states[k] = next; - } - if states[k] == self.accept_state { - resolved |= 1 << k; - } - } - if !any_progress { - break; - } - } - - std::array::from_fn(|k| states[k] == self.accept_state) - } -} - -// --------------------------------------------------------------------------- -// Pre-extracted data for alloc-free benchmarking -// --------------------------------------------------------------------------- - -struct PreparedArray { - all_bytes: Vec, - offsets: Vec, - n: usize, -} - -impl PreparedArray { - fn from_fsst(array: &FSSTArray) -> Self { - let codes = array.codes(); - let offsets_prim = codes.offsets().to_primitive(); - let all_bytes = codes.bytes(); - let all_bytes = all_bytes.as_slice().to_vec(); - let n = codes.len(); - - let offsets: Vec = match_each_integer_ptype!(offsets_prim.ptype(), |T| { - offsets_prim - .as_slice::() - .iter() - .map(|&v| v as usize) - .collect() - }); - - Self { - all_bytes, - offsets, - n, - } - } -} - -// --------------------------------------------------------------------------- -// Benchmark helpers -// 
--------------------------------------------------------------------------- - -#[inline(never)] -fn run_split(dfa: &SplitTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches(&prep.all_bytes[start..end]) { - out.set(i); - } - } -} - -#[inline(never)] -fn run_fused(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches(&prep.all_bytes[start..end]) { - out.set(i); - } - } -} - -#[inline(never)] -fn run_fused_no_exit(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches_no_early_exit(&prep.all_bytes[start..end]) { - out.set(i); - } - } -} - -#[inline(never)] -fn run_fused_unsafe(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if unsafe { dfa.matches_unchecked(&prep.all_bytes[start..end]) } { - out.set(i); - } - } -} - -#[inline(never)] -fn run_fused_no_exit_unsafe(dfa: &FusedTableDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } { - out.set(i); - } - } -} - -#[inline(never)] -fn run_branchless(dfa: &BranchlessEscapeDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches(&prep.all_bytes[start..end]) { - out.set(i); - } - } -} - -#[cfg(target_arch = "x86_64")] -#[inline(never)] -fn run_simd_gather_8(dfa: &SimdGatherDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - let mut i = 0; - while i + 8 <= prep.n { - let starts: [usize; 8] = std::array::from_fn(|k| 
prep.offsets[i + k]); - let ends: [usize; 8] = std::array::from_fn(|k| prep.offsets[i + k + 1]); - - #[cfg(target_feature = "avx2")] - let results = unsafe { dfa.matches_8_avx2(&prep.all_bytes, &starts, &ends) }; - #[cfg(not(target_feature = "avx2"))] - let results = { - let mut r = [false; 8]; - for k in 0..8 { - r[k] = dfa.matches_scalar(&prep.all_bytes[starts[k]..ends[k]]); - } - r - }; - - for k in 0..8 { - if results[k] { - out.set(i + k); - } - } - i += 8; - } - // Remainder - while i < prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches_scalar(&prep.all_bytes[start..end]) { - out.set(i); - } - i += 1; - } -} - -#[inline(never)] -fn run_compact(dfa: &CompactDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches(&prep.all_bytes[start..end]) { - out.set(i); - } - } -} - -#[inline(never)] -fn run_prefilter(dfa: &PrefilterDfa, prep: &PreparedArray, out: &mut BitBufferMut) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches(&prep.all_bytes[start..end]) { - out.set(i); - } - } -} - -fn bench_decompress(array: &FSSTArray, needle: &[u8], out: &mut Vec) { - out.clear(); - let decompressor = array.decompressor(); - array.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - decompressed.windows(needle.len()).any(|w| w == needle) - } - None => false, - })); - }); -} - -// --------------------------------------------------------------------------- -// Alloc-free decompress + match: reuse a buffer, inline the decompress logic. -// This measures pure decompress+search cost without per-string allocation. -// --------------------------------------------------------------------------- - -/// Decompress FSST codes into `buf`, returning the number of bytes written. 
-/// This avoids all allocation by writing into a caller-provided buffer. -#[inline] -fn decompress_into(codes: &[u8], symbols: &[Symbol], symbol_lengths: &[u8], buf: &mut Vec) { - buf.clear(); - let mut pos = 0; - while pos < codes.len() { - let code = codes[pos]; - pos += 1; - if code == ESCAPE_CODE { - if pos < codes.len() { - buf.push(codes[pos]); - pos += 1; - } - } else { - let sym = symbols[code as usize].to_u64().to_le_bytes(); - let len = symbol_lengths[code as usize] as usize; - buf.extend_from_slice(&sym[..len]); - } - } -} - -/// Alloc-free decompress + sliding window match using PreparedArray. -/// Pre-allocates the decompression buffer once outside the benchmark loop. -#[inline(never)] -fn run_decompress_match( - prep: &PreparedArray, - symbols: &[Symbol], - symbol_lengths: &[u8], - needle: &[u8], - buf: &mut Vec, - out: &mut BitBufferMut, -) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - decompress_into(&prep.all_bytes[start..end], symbols, symbol_lengths, buf); - if buf.windows(needle.len()).any(|w| w == needle) { - out.set(i); - } - } -} - -/// Alloc-free decompress + memmem match using PreparedArray. 
-#[inline(never)] -fn run_decompress_memmem( - prep: &PreparedArray, - symbols: &[Symbol], - symbol_lengths: &[u8], - needle: &[u8], - buf: &mut Vec, - out: &mut BitBufferMut, -) { - let finder = memmem::Finder::new(needle); - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - decompress_into(&prep.all_bytes[start..end], symbols, symbol_lengths, buf); - if finder.find(buf).is_some() { - out.set(i); - } - } -} - -// --------------------------------------------------------------------------- -// Benchmarks -// --------------------------------------------------------------------------- - -const N: usize = 100_000; -const NEEDLE: &[u8] = b"google"; - -// --------------------------------------------------------------------------- -// ClickBench-style URL generator (longer, more realistic URLs with query -// params, fragments, UTM tracking, referrers, etc.) -// --------------------------------------------------------------------------- - -const CB_DOMAINS: &[&str] = &[ - "www.google.com", - "yandex.ru", - "mail.ru", - "vk.com", - "www.youtube.com", - "www.facebook.com", - "ok.ru", - "go.mail.ru", - "www.avito.ru", - "pogoda.yandex.ru", - "news.yandex.ru", - "maps.yandex.ru", - "market.yandex.ru", - "afisha.yandex.ru", - "auto.ru", - "www.kinopoisk.ru", - "www.ozon.ru", - "www.wildberries.ru", - "aliexpress.ru", - "lenta.ru", -]; - -const CB_PATHS: &[&str] = &[ - "/search", - "/catalog/electronics/smartphones", - "/product/item/123456789", - "/news/2024/03/15/article-about-technology", - "/user/profile/settings/notifications", - "/api/v2/catalog/search", - "/checkout/cart/summary", - "/blog/2024/how-to-optimize-database-queries-for-better-performance", - "/category/home-and-garden/furniture/tables", - "/", -]; - -const CB_PARAMS: &[&str] = &[ - "?utm_source=google&utm_medium=cpc&utm_campaign=spring_sale_2024&utm_content=banner_v2", - "?q=buy+smartphone+online+cheap+free+shipping&category=electronics&sort=price_asc&page=3", - 
"?ref=main_page_carousel_block_position_4&sessionid=abc123def456", - "?from=tabbar&clid=2270455&text=weather+forecast+tomorrow", - "?lr=213&msid=1234567890.12345&suggest_reqid=abcdef&csg=12345", - "", - "", - "", - "?page=1&per_page=20", - "?source=serp&forceshow=1", -]; - -const CB_FRAGMENTS: &[&str] = &[ - "", - "", - "", - "#section-reviews", - "#comments", - "#price-history", - "", - "", - "", - "", -]; - -fn generate_clickbench_urls(n: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(123); - (0..n) - .map(|_| { - let scheme = if rng.random_bool(0.7) { - "https" - } else { - "http" - }; - let domain = CB_DOMAINS[rng.random_range(0..CB_DOMAINS.len())]; - let path = CB_PATHS[rng.random_range(0..CB_PATHS.len())]; - let params = CB_PARAMS[rng.random_range(0..CB_PARAMS.len())]; - let fragment = CB_FRAGMENTS[rng.random_range(0..CB_FRAGMENTS.len())]; - format!("{scheme}://{domain}{path}{params}{fragment}") - }) - .collect() -} - -fn make_fsst_clickbench_urls(n: usize) -> FSSTArray { - let urls = generate_clickbench_urls(n); - let varbin = VarBinArray::from_iter( - urls.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - fsst_compress(varbin, &compressor) -} - -const CB_NEEDLE: &[u8] = b"yandex"; - -// --------------------------------------------------------------------------- -// Log lines generator (Apache/nginx-style access logs) -// --------------------------------------------------------------------------- - -const LOG_METHODS: &[&str] = &["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD"]; -const LOG_PATHS: &[&str] = &[ - "/api/v1/users", - "/api/v2/products/search", - "/healthcheck", - "/static/js/app.bundle.min.js", - "/favicon.ico", - "/login", - "/dashboard/analytics", - "/api/v1/orders/12345/status", - "/graphql", - "/metrics", -]; -const LOG_STATUS: &[u16] = &[ - 200, 200, 200, 200, 200, 201, 301, 302, 400, 403, 404, 500, 502, -]; -const LOG_IPS: &[&str] = &[ - 
"192.168.1.1", - "10.0.0.42", - "172.16.0.100", - "203.0.113.50", - "198.51.100.23", - "8.8.8.8", - "1.1.1.1", - "74.125.200.100", - "151.101.1.69", - "93.184.216.34", -]; -const LOG_UAS: &[&str] = &[ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)", - "curl/7.81.0", - "python-requests/2.28.1", - "Go-http-client/1.1", - "Googlebot/2.1 (+http://www.google.com/bot.html)", -]; - -fn generate_log_lines(n: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(456); - (0..n) - .map(|_| { - let ip = LOG_IPS[rng.random_range(0..LOG_IPS.len())]; - let method = LOG_METHODS[rng.random_range(0..LOG_METHODS.len())]; - let path = LOG_PATHS[rng.random_range(0..LOG_PATHS.len())]; - let status = LOG_STATUS[rng.random_range(0..LOG_STATUS.len())]; - let size = rng.random_range(100..50000); - let ua = LOG_UAS[rng.random_range(0..LOG_UAS.len())]; - format!( - r#"{ip} - - [15/Mar/2024:10:{:02}:{:02} +0000] "{method} {path} HTTP/1.1" {status} {size} "-" "{ua}""#, - rng.random_range(0..60u32), - rng.random_range(0..60u32), - ) - }) - .collect() -} - -fn make_fsst_log_lines(n: usize) -> FSSTArray { - let lines = generate_log_lines(n); - let varbin = VarBinArray::from_iter( - lines.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - fsst_compress(varbin, &compressor) -} - -const LOG_NEEDLE: &[u8] = b"Googlebot"; - -// --------------------------------------------------------------------------- -// JSON strings generator (typical API response payloads) -// --------------------------------------------------------------------------- - -const JSON_NAMES: &[&str] = &[ - "Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack", -]; -const JSON_CITIES: &[&str] = &[ - "New York", - "London", - "Tokyo", - "Berlin", - "Sydney", - "Toronto", - "Paris", - "Mumbai", - "São Paulo", - "Seoul", -]; -const JSON_TAGS: &[&str] = &[ - 
"premium", - "verified", - "admin", - "moderator", - "subscriber", - "trial", - "enterprise", - "developer", -]; - -fn generate_json_strings(n: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(789); - (0..n) - .map(|_| { - let name = JSON_NAMES[rng.random_range(0..JSON_NAMES.len())]; - let city = JSON_CITIES[rng.random_range(0..JSON_CITIES.len())]; - let age = rng.random_range(18..80u32); - let tag1 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; - let tag2 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; - let id = rng.random_range(10000..99999u32); - format!( - r#"{{"id":{id},"name":"{name}","age":{age},"city":"{city}","tags":["{tag1}","{tag2}"],"active":true}}"# - ) - }) - .collect() -} - -fn make_fsst_json_strings(n: usize) -> FSSTArray { - let jsons = generate_json_strings(n); - let varbin = VarBinArray::from_iter( - jsons.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - fsst_compress(varbin, &compressor) -} - -const JSON_NEEDLE: &[u8] = b"enterprise"; - -// --------------------------------------------------------------------------- -// File paths generator (Unix-style paths with various depths) -// --------------------------------------------------------------------------- - -const PATH_ROOTS: &[&str] = &[ - "/home/user", - "/var/log", - "/etc", - "/usr/local/bin", - "/opt/app", - "/tmp", - "/srv/www", - "/data/warehouse", -]; -const PATH_DIRS: &[&str] = &[ - "src", - "build", - "dist", - "node_modules", - "target/release", - "config", - ".cache", - "logs/2024", - "backups/daily", - "migrations", -]; -const PATH_FILES: &[&str] = &[ - "main.rs", - "index.ts", - "config.yaml", - "Dockerfile", - "schema.sql", - "app.log", - "data.parquet", - "model.onnx", - "README.md", - "package.json", -]; - -fn generate_file_paths(n: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(321); - (0..n) - .map(|_| { - let root = PATH_ROOTS[rng.random_range(0..PATH_ROOTS.len())]; - let 
dir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; - let file = PATH_FILES[rng.random_range(0..PATH_FILES.len())]; - let depth = rng.random_range(0..3u32); - let mut path = format!("{root}/{dir}"); - for _ in 0..depth { - let subdir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; - path.push('/'); - path.push_str(subdir); - } - path.push('/'); - path.push_str(file); - path - }) - .collect() -} - -fn make_fsst_file_paths(n: usize) -> FSSTArray { - let paths = generate_file_paths(n); - let varbin = VarBinArray::from_iter( - paths.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - fsst_compress(varbin, &compressor) -} - -const PATH_NEEDLE: &[u8] = b"target/release"; - -// --------------------------------------------------------------------------- -// Email addresses generator -// --------------------------------------------------------------------------- - -const EMAIL_USERS: &[&str] = &[ - "john.doe", - "jane.smith", - "admin", - "support", - "no-reply", - "sales.team", - "dev+test", - "marketing", - "info", - "contact.us", -]; -const EMAIL_DOMAINS: &[&str] = &[ - "gmail.com", - "yahoo.com", - "outlook.com", - "company.io", - "example.org", - "mail.ru", - "protonmail.com", - "fastmail.com", - "icloud.com", - "hey.com", -]; - -fn generate_emails(n: usize) -> Vec { - let mut rng = StdRng::seed_from_u64(654); - (0..n) - .map(|_| { - let user = EMAIL_USERS[rng.random_range(0..EMAIL_USERS.len())]; - let domain = EMAIL_DOMAINS[rng.random_range(0..EMAIL_DOMAINS.len())]; - let suffix = rng.random_range(0..1000u32); - format!("{user}{suffix}@{domain}") - }) - .collect() -} - -fn make_fsst_emails(n: usize) -> FSSTArray { - let emails = generate_emails(n); - let varbin = VarBinArray::from_iter( - emails.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - fsst_compress(varbin, &compressor) -} - -const 
EMAIL_NEEDLE: &[u8] = b"gmail"; - -/// Macro to reduce boilerplate for DFA benchmarks with pre-allocated output. -macro_rules! dfa_bench { - ($name:ident, $dfa_ty:ident, $run_fn:ident) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = $dfa_ty::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - $run_fn(&dfa, &prep, &mut out); - }); - } - }; -} - -// 1. Split table (production baseline) -dfa_bench!(split_table, SplitTableDfa, run_split); - -// 2. Fused 256-wide table -dfa_bench!(fused_table, FusedTableDfa, run_fused); - -// 3. Fused table, no early exit on accept -dfa_bench!(fused_no_early_exit, FusedTableDfa, run_fused_no_exit); - -// 4. Fused table, unsafe (no bounds checks) -dfa_bench!(fused_unsafe, FusedTableDfa, run_fused_unsafe); - -// 5. Fused table, no early exit + unsafe -dfa_bench!( - fused_no_exit_unsafe, - FusedTableDfa, - run_fused_no_exit_unsafe -); - -// 6. Branchless escape handling -dfa_bench!(branchless_escape, BranchlessEscapeDfa, run_branchless); - -// 7. SIMD gather (8 strings at a time, u32 table) -#[cfg(target_arch = "x86_64")] -#[divan::bench] -fn simd_gather_8(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = SimdGatherDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - run_simd_gather_8(&dfa, &prep, &mut out); - }); -} - -// 8. Decompress then search (worst-case baseline, allocates per string) -#[divan::bench] -fn decompress_then_search(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - bench_decompress(&fsst, NEEDLE, &mut out); - }); -} - -// 8b. 
Alloc-free decompress + sliding window match -#[divan::bench] -fn decompress_no_alloc(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let symbols = fsst.symbols(); - let symbol_lengths = fsst.symbol_lengths(); - let mut buf = Vec::with_capacity(256); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - run_decompress_match( - &prep, - symbols.as_slice(), - symbol_lengths.as_slice(), - NEEDLE, - &mut buf, - &mut out, - ); - }); -} - -// 8c. Alloc-free decompress + memmem (SIMD substring search) -#[divan::bench] -fn decompress_no_alloc_memmem(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let symbols = fsst.symbols(); - let symbol_lengths = fsst.symbol_lengths(); - let mut buf = Vec::with_capacity(256); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - run_decompress_memmem( - &prep, - symbols.as_slice(), - symbol_lengths.as_slice(), - NEEDLE, - &mut buf, - &mut out, - ); - }); -} - -// 9. Chunk-of-64: match 64 strings, stack-alloc results, then pack bits. -// This aligns with collect_bool's internal 64-bit chunking. -#[divan::bench] -fn fused_chunk_64(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = FusedTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -// 10. Chunk-of-64 with unsafe matches. 
-#[divan::bench] -fn fused_chunk_64_unsafe(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = FusedTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } - }) - }); -} - -// 11. Compact u8 table (halved table size) -dfa_bench!(compact_table, CompactDfa, run_compact); - -// 12. Compact u8 + collect_bool -#[divan::bench] -fn compact_chunk_64(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = CompactDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -// 13. Compact u8 + collect_bool + unsafe -#[divan::bench] -fn compact_chunk_64_unsafe(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = CompactDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } - }) - }); -} - -// 14. Prefilter (skip strings with no relevant codes) -dfa_bench!(prefilter, PrefilterDfa, run_prefilter); - -// 15. 
Prefilter + collect_bool -#[divan::bench] -fn prefilter_chunk_64(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = PrefilterDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -// 16. Streaming continuous scan (single pass through all codes) -#[divan::bench] -fn streaming_continuous(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = CompactDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - streaming_scan_continuous(&dfa, &prep.all_bytes, &prep.offsets, prep.n, &mut out); - }); -} - -// 17. Shift-based DFA (u64 packed transitions) -#[divan::bench] -fn shift_dfa(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = ShiftDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -// 18. Shift-based DFA, no early exit -#[divan::bench] -fn shift_dfa_no_exit(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = ShiftDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -// 19. 
Sheng DFA (PSHUFB transitions) -#[cfg(target_arch = "x86_64")] -#[divan::bench] -fn sheng_dfa(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = ShengDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches(&prep.all_bytes[start..end]) } - }) - }); -} - -// 20. Sheng DFA, no early exit -#[cfg(target_arch = "x86_64")] -#[divan::bench] -fn sheng_dfa_no_exit(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = ShengDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches_no_early_exit(&prep.all_bytes[start..end]) } - }) - }); -} - -// 21. Enumerated DFA (track all start states) -#[divan::bench] -fn enumerated_dfa(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = EnumeratedDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -// 12. 
Multi-string early exit with bitmask (8 at a time) -#[divan::bench] -fn fused_multi_early_exit_8(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = FusedTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - let mut i = 0; - while i + 8 <= prep.n { - let starts: [usize; 8] = std::array::from_fn(|k| prep.offsets[i + k]); - let ends: [usize; 8] = std::array::from_fn(|k| prep.offsets[i + k + 1]); - let results = dfa.matches_multi_early_exit(&prep.all_bytes, &starts, &ends); - for k in 0..8 { - if results[k] { - out.set(i + k); - } - } - i += 8; - } - while i < prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - if dfa.matches(&prep.all_bytes[start..end]) { - out.set(i); - } - i += 1; - } - }); -} - -// Aho-Corasick on decompressed data: decompress each string then search with aho-corasick -#[divan::bench] -fn aho_corasick_decompress(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let ac = AhoCorasick::new([NEEDLE]).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - ac.is_match(&decompressed) - } - None => false, - })); - }); - out - }); -} - -// Aho-Corasick on raw (canonicalized) bytes: decompress the whole array up front, -// then search each string using aho-corasick's SIMD-accelerated search -#[divan::bench] -fn aho_corasick_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let ac = AhoCorasick::new([NEEDLE]).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| 
match s { - Some(bytes) => ac.is_match(bytes), - None => false, - })); - }); - out - }); -} - -// 13. Original collect_bool approach (includes alloc) -#[divan::bench] -fn split_table_collect_bool(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = SplitTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -// --------------------------------------------------------------------------- -// ClickBench-style URL benchmarks (longer URLs with query params, fragments) -// --------------------------------------------------------------------------- - -#[divan::bench] -fn cb_split_table(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = SplitTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -#[divan::bench] -fn cb_fused_table(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = FusedTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -#[divan::bench] -fn cb_fused_chunk_64(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = FusedTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - 
CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -#[divan::bench] -fn cb_fused_chunk_64_unsafe(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = FusedTableDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } - }) - }); -} - -#[divan::bench] -fn cb_shift_dfa(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = ShiftDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -#[cfg(target_arch = "x86_64")] -#[divan::bench] -fn cb_sheng_dfa(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = ShengDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches_no_early_exit(&prep.all_bytes[start..end]) } - }) - }); -} - -#[divan::bench] -fn cb_compact_chunk_64_unsafe(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = CompactDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - 
BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - unsafe { dfa.matches_no_exit_unchecked(&prep.all_bytes[start..end]) } - }) - }); -} - -#[divan::bench] -fn cb_prefilter_chunk_64(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = PrefilterDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -#[divan::bench] -fn cb_streaming_continuous(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = CompactDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - streaming_scan_continuous(&dfa, &prep.all_bytes, &prep.offsets, prep.n, &mut out); - }); -} - -#[divan::bench] -fn cb_decompress_then_search(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - bench_decompress(&fsst, CB_NEEDLE, &mut out); - }); -} - -#[divan::bench] -fn cb_decompress_no_alloc(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let symbols = fsst.symbols(); - let symbol_lengths = fsst.symbol_lengths(); - let mut buf = Vec::with_capacity(512); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - run_decompress_match( - &prep, - symbols.as_slice(), - symbol_lengths.as_slice(), - CB_NEEDLE, - &mut buf, - &mut out, - ); - }); -} - -#[divan::bench] -fn cb_decompress_no_alloc_memmem(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let 
prep = PreparedArray::from_fsst(&fsst); - let symbols = fsst.symbols(); - let symbol_lengths = fsst.symbol_lengths(); - let mut buf = Vec::with_capacity(512); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - run_decompress_memmem( - &prep, - symbols.as_slice(), - symbol_lengths.as_slice(), - CB_NEEDLE, - &mut buf, - &mut out, - ); - }); -} - -#[divan::bench] -fn cb_aho_corasick_decompress(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let ac = AhoCorasick::new([CB_NEEDLE]).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - ac.is_match(&decompressed) - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn cb_aho_corasick_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let ac = AhoCorasick::new([CB_NEEDLE]).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => ac.is_match(bytes), - None => false, - })); - }); - out - }); -} - -// --------------------------------------------------------------------------- -// Benchmarks for additional data types (log lines, JSON, file paths, emails) -// --------------------------------------------------------------------------- - -/// Macro for benchmarks on a specific data generator + needle combo. -macro_rules! 
data_bench { - ($name:ident, $make_fn:ident, $needle:expr, $dfa_ty:ident, $match_method:ident) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = $make_fn(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = $dfa_ty::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - $needle, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.$match_method(&prep.all_bytes[start..end]) - }) - }); - } - }; -} - -// Log lines: long strings (~150 chars), low match rate for "Googlebot" -data_bench!( - log_split_table, - make_fsst_log_lines, - LOG_NEEDLE, - SplitTableDfa, - matches -); -data_bench!( - log_shift_dfa, - make_fsst_log_lines, - LOG_NEEDLE, - ShiftDfa, - matches_no_early_exit -); -data_bench!( - log_compact_no_exit, - make_fsst_log_lines, - LOG_NEEDLE, - CompactDfa, - matches_no_early_exit -); -data_bench!( - log_fused_no_exit, - make_fsst_log_lines, - LOG_NEEDLE, - FusedTableDfa, - matches_no_early_exit -); - -#[divan::bench] -fn log_decompress(bencher: Bencher) { - let fsst = make_fsst_log_lines(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - bench_decompress(&fsst, LOG_NEEDLE, &mut out); - }); -} - -// JSON strings: structured data (~80-100 chars), searching for "enterprise" -data_bench!( - json_split_table, - make_fsst_json_strings, - JSON_NEEDLE, - SplitTableDfa, - matches -); -data_bench!( - json_shift_dfa, - make_fsst_json_strings, - JSON_NEEDLE, - ShiftDfa, - matches_no_early_exit -); -data_bench!( - json_compact_no_exit, - make_fsst_json_strings, - JSON_NEEDLE, - CompactDfa, - matches_no_early_exit -); -data_bench!( - json_fused_no_exit, - make_fsst_json_strings, - JSON_NEEDLE, - FusedTableDfa, - matches_no_early_exit -); - -#[divan::bench] -fn json_decompress(bencher: Bencher) { - let fsst = make_fsst_json_strings(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - 
bench_decompress(&fsst, JSON_NEEDLE, &mut out); - }); -} - -// File paths: medium-length (~40-80 chars), searching for "target/release" -data_bench!( - path_split_table, - make_fsst_file_paths, - PATH_NEEDLE, - SplitTableDfa, - matches -); -data_bench!( - path_shift_dfa, - make_fsst_file_paths, - PATH_NEEDLE, - ShiftDfa, - matches_no_early_exit -); -data_bench!( - path_compact_no_exit, - make_fsst_file_paths, - PATH_NEEDLE, - CompactDfa, - matches_no_early_exit -); -data_bench!( - path_fused_no_exit, - make_fsst_file_paths, - PATH_NEEDLE, - FusedTableDfa, - matches_no_early_exit -); - -#[divan::bench] -fn path_decompress(bencher: Bencher) { - let fsst = make_fsst_file_paths(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - bench_decompress(&fsst, PATH_NEEDLE, &mut out); - }); -} - -// Email addresses: short strings (~20-30 chars), searching for "gmail" -data_bench!( - email_split_table, - make_fsst_emails, - EMAIL_NEEDLE, - SplitTableDfa, - matches -); -data_bench!( - email_shift_dfa, - make_fsst_emails, - EMAIL_NEEDLE, - ShiftDfa, - matches_no_early_exit -); -data_bench!( - email_compact_no_exit, - make_fsst_emails, - EMAIL_NEEDLE, - CompactDfa, - matches_no_early_exit -); -data_bench!( - email_fused_no_exit, - make_fsst_emails, - EMAIL_NEEDLE, - FusedTableDfa, - matches_no_early_exit -); - -#[divan::bench] -fn email_decompress(bencher: Bencher) { - let fsst = make_fsst_emails(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - bench_decompress(&fsst, EMAIL_NEEDLE, &mut out); - }); -} - -// --------------------------------------------------------------------------- -// memchr::memmem benchmarks — SIMD-accelerated substring search on decompressed data -// --------------------------------------------------------------------------- - -#[divan::bench] -fn memmem_decompress_urls(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let finder = memmem::Finder::new(NEEDLE); - bencher.bench_local(|| { - let mut out = 
Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - finder.find(&decompressed).is_some() - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn memmem_on_raw_bytes_urls(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let finder = memmem::Finder::new(NEEDLE); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => finder.find(bytes).is_some(), - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn cb_memmem_decompress(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let finder = memmem::Finder::new(CB_NEEDLE); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - finder.find(&decompressed).is_some() - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn cb_memmem_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let finder = memmem::Finder::new(CB_NEEDLE); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => finder.find(bytes).is_some(), - None => false, - })); - }); - out - }); -} - -// --------------------------------------------------------------------------- -// Low match rate (~0.001%) benchmarks — needle appears in ~1/100K strings. -// Tests performance when almost no string matches (common in large datasets). -// Uses random alphanumeric strings with a rare injected match. 
-// --------------------------------------------------------------------------- - -const RARE_NEEDLE: &[u8] = b"xyzzy"; - -/// Generate N random alphanumeric strings (~40 chars each), injecting the needle -/// into approximately `match_rate` fraction of them. -fn generate_rare_match_strings(n: usize, match_rate: f64) -> Vec { - let mut rng = StdRng::seed_from_u64(999); - let charset: &[u8] = b"abcdefghijklmnopqrstuvwABCDEFGHIJKLMNOPQRSTUVW0123456789-_.:/"; - (0..n) - .map(|_| { - let len = rng.random_range(30..60); - let mut s: String = (0..len) - .map(|_| charset[rng.random_range(0..charset.len())] as char) - .collect(); - if rng.random_bool(match_rate) { - // Inject needle at random position - let pos = rng.random_range(0..s.len().saturating_sub(RARE_NEEDLE.len()) + 1); - s.replace_range( - pos..pos + RARE_NEEDLE.len().min(s.len() - pos), - std::str::from_utf8(RARE_NEEDLE).unwrap(), - ); - } - s - }) - .collect() -} - -fn make_fsst_rare_match(n: usize) -> FSSTArray { - let strings = generate_rare_match_strings(n, 0.00001); // ~0.001% - let varbin = VarBinArray::from_iter( - strings.iter().map(|s| Some(s.as_str())), - DType::Utf8(Nullability::NonNullable), - ); - let compressor = fsst_train_compressor(&varbin); - fsst_compress(varbin, &compressor) -} - -data_bench!( - rare_split_table, - make_fsst_rare_match, - RARE_NEEDLE, - SplitTableDfa, - matches -); -data_bench!( - rare_shift_dfa, - make_fsst_rare_match, - RARE_NEEDLE, - ShiftDfa, - matches_no_early_exit -); -data_bench!( - rare_compact_no_exit, - make_fsst_rare_match, - RARE_NEEDLE, - CompactDfa, - matches_no_early_exit -); -data_bench!( - rare_fused_no_exit, - make_fsst_rare_match, - RARE_NEEDLE, - FusedTableDfa, - matches_no_early_exit -); - -#[divan::bench] -fn rare_decompress(bencher: Bencher) { - let fsst = make_fsst_rare_match(N); - let mut out = Vec::with_capacity(N); - bencher.bench_local(|| { - bench_decompress(&fsst, RARE_NEEDLE, &mut out); - }); -} - -#[divan::bench] -fn 
rare_memmem_decompress(bencher: Bencher) { - let fsst = make_fsst_rare_match(N); - let finder = memmem::Finder::new(RARE_NEEDLE); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - finder.find(&decompressed).is_some() - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn rare_prefilter(bencher: Bencher) { - let fsst = make_fsst_rare_match(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = PrefilterDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - RARE_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches_no_early_exit(&prep.all_bytes[start..end]) - }) - }); -} - -data_bench!( - rare_state_zero_skip, - make_fsst_rare_match, - RARE_NEEDLE, - StateZeroSkipDfa, - matches -); - -// State-zero skip on URLs (moderate match rate) -data_bench!( - state_zero_skip_urls, - make_fsst_urls, - NEEDLE, - StateZeroSkipDfa, - matches -); - -// State-zero skip on ClickBench URLs -#[divan::bench] -fn cb_state_zero_skip(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = StateZeroSkipDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -// --------------------------------------------------------------------------- -// Alloc-free decompress benchmarks for all data types -// --------------------------------------------------------------------------- - -macro_rules! 
decompress_no_alloc_bench { - ($name:ident, $make_fn:ident, $needle:expr, $bufsz:expr) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = $make_fn(N); - let prep = PreparedArray::from_fsst(&fsst); - let symbols = fsst.symbols(); - let symbol_lengths = fsst.symbol_lengths(); - let mut buf = Vec::with_capacity($bufsz); - let mut out = BitBufferMut::new_unset(N); - bencher.bench_local(|| { - out.fill_range(0, N, false); - run_decompress_memmem( - &prep, - symbols.as_slice(), - symbol_lengths.as_slice(), - $needle, - &mut buf, - &mut out, - ); - }); - } - }; -} - -decompress_no_alloc_bench!( - log_decompress_no_alloc, - make_fsst_log_lines, - LOG_NEEDLE, - 256 -); -decompress_no_alloc_bench!( - json_decompress_no_alloc, - make_fsst_json_strings, - JSON_NEEDLE, - 256 -); -decompress_no_alloc_bench!( - path_decompress_no_alloc, - make_fsst_file_paths, - PATH_NEEDLE, - 256 -); -decompress_no_alloc_bench!( - email_decompress_no_alloc, - make_fsst_emails, - EMAIL_NEEDLE, - 64 -); -decompress_no_alloc_bench!( - rare_decompress_no_alloc, - make_fsst_rare_match, - RARE_NEEDLE, - 128 -); - -// --------------------------------------------------------------------------- -// regex-automata DFA benchmarks -// --------------------------------------------------------------------------- - -#[divan::bench] -fn regex_automata_dense_decompress(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let re = DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - re.is_match(&decompressed) - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn regex_automata_dense_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let re 
= DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => re.is_match(bytes), - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn regex_automata_sparse_decompress(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let dense = DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); - let (fwd, rev) = ( - dense.forward().to_sparse().unwrap(), - dense.reverse().to_sparse().unwrap(), - ); - let re = regex_automata::dfa::regex::Regex::builder().build_from_dfas(fwd, rev); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - re.is_match(&decompressed) - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn regex_automata_sparse_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let dense = DfaRegex::new(std::str::from_utf8(NEEDLE).unwrap()).unwrap(); - let (fwd, rev) = ( - dense.forward().to_sparse().unwrap(), - dense.reverse().to_sparse().unwrap(), - ); - let re = regex_automata::dfa::regex::Regex::builder().build_from_dfas(fwd, rev); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => re.is_match(bytes), - None => false, - })); - }); - out - }); -} - -// --------------------------------------------------------------------------- -// jetscii benchmarks — PCMPESTRI-based substring search -// --------------------------------------------------------------------------- - -#[divan::bench] -fn jetscii_decompress(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let finder = 
jetscii::ByteSubstring::new(NEEDLE); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - finder.find(&decompressed).is_some() - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn jetscii_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let finder = jetscii::ByteSubstring::new(NEEDLE); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => finder.find(bytes).is_some(), - None => false, - })); - }); - out - }); -} - -// --------------------------------------------------------------------------- -// daachorse benchmarks — double-array Aho-Corasick -// --------------------------------------------------------------------------- - -#[divan::bench] -fn daachorse_decompress(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let ac = DoubleArrayAhoCorasick::::new([NEEDLE]).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - let decompressor = fsst.decompressor(); - fsst.codes().with_iterator(|iter| { - out.extend(iter.map(|codes| match codes { - Some(c) => { - let decompressed = decompressor.decompress(c); - ac.find_iter(&decompressed).next().is_some() - } - None => false, - })); - }); - out - }); -} - -#[divan::bench] -fn daachorse_on_raw_bytes(bencher: Bencher) { - let fsst = make_fsst_urls(N); - let canonical = fsst.to_canonical().unwrap().into_varbinview(); - let ac = DoubleArrayAhoCorasick::::new([NEEDLE]).unwrap(); - bencher.bench_local(|| { - let mut out = Vec::with_capacity(N); - canonical.with_iterator(|iter| { - out.extend(iter.map(|s| match s { - Some(bytes) => ac.find_iter(bytes).next().is_some(), - None => false, - })); - }); - out - 
}); -} - -// --------------------------------------------------------------------------- -// Hybrid DFA benchmarks -// --------------------------------------------------------------------------- - -data_bench!( - prefilter_shift_urls, - make_fsst_urls, - NEEDLE, - PrefilterShiftDfa, - matches -); -data_bench!( - prefilter_shift_rare, - make_fsst_rare_match, - RARE_NEEDLE, - PrefilterShiftDfa, - matches -); -data_bench!( - state_zero_shift_urls, - make_fsst_urls, - NEEDLE, - StateZeroShiftDfa, - matches -); -data_bench!( - state_zero_shift_rare, - make_fsst_rare_match, - RARE_NEEDLE, - StateZeroShiftDfa, - matches -); - -#[divan::bench] -fn cb_prefilter_shift(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = PrefilterShiftDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -#[divan::bench] -fn cb_state_zero_shift(bencher: Bencher) { - let fsst = make_fsst_clickbench_urls(N); - let prep = PreparedArray::from_fsst(&fsst); - let dfa = StateZeroShiftDfa::new( - fsst.symbols().as_slice(), - fsst.symbol_lengths().as_slice(), - CB_NEEDLE, - ); - bencher.bench_local(|| { - BitBufferMut::collect_bool(prep.n, |i| { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - dfa.matches(&prep.all_bytes[start..end]) - }) - }); -} - -// --------------------------------------------------------------------------- -// Decompress-only benchmarks (no search) — measures the raw cost of FSST -// decompression for each dataset. Compare against DFA search on compressed -// codes to see the speedup from avoiding decompression entirely. -// --------------------------------------------------------------------------- - -/// Decompress all strings without searching. 
Measures pure decompression cost. -#[inline(never)] -fn run_decompress_only( - prep: &PreparedArray, - symbols: &[Symbol], - symbol_lengths: &[u8], - buf: &mut Vec, -) { - for i in 0..prep.n { - let start = prep.offsets[i]; - let end = prep.offsets[i + 1]; - decompress_into(&prep.all_bytes[start..end], symbols, symbol_lengths, buf); - // Force the compiler not to optimize away the decompression. - std::hint::black_box(buf.len()); - } -} - -macro_rules! decompress_only_bench { - ($name:ident, $make_fn:ident, $bufsz:expr) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = $make_fn(N); - let prep = PreparedArray::from_fsst(&fsst); - let symbols = fsst.symbols(); - let symbol_lengths = fsst.symbol_lengths(); - let mut buf = Vec::with_capacity($bufsz); - bencher.bench_local(|| { - run_decompress_only( - &prep, - symbols.as_slice(), - symbol_lengths.as_slice(), - &mut buf, - ); - }); - } - }; -} - -decompress_only_bench!(urls_decompress_only, make_fsst_urls, 256); -decompress_only_bench!(cb_decompress_only, make_fsst_clickbench_urls, 512); -decompress_only_bench!(log_decompress_only, make_fsst_log_lines, 256); -decompress_only_bench!(json_decompress_only, make_fsst_json_strings, 256); -decompress_only_bench!(path_decompress_only, make_fsst_file_paths, 256); -decompress_only_bench!(email_decompress_only, make_fsst_emails, 64); -decompress_only_bench!(rare_decompress_only, make_fsst_rare_match, 128); - -// --------------------------------------------------------------------------- -// Vortex array LIKE kernel benchmarks — end-to-end through the full vortex -// execution framework. This measures the production code path including -// array construction, kernel dispatch, and result materialization. 
-// --------------------------------------------------------------------------- +#![allow(clippy::unwrap_used)] +use std::fmt; use std::sync::LazyLock; +use divan::Bencher; use vortex_array::Canonical; use vortex_array::IntoArray; use vortex_array::VortexSessionExecute; @@ -3549,102 +15,98 @@ use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; use vortex_array::scalar_fn::fns::like::Like; use vortex_array::scalar_fn::fns::like::LikeOptions; use vortex_array::session::ArraySession; +use vortex_fsst::FSSTArray; +use vortex_fsst::test_utils::NUM_STRINGS; +use vortex_fsst::test_utils::make_fsst_clickbench_urls; +use vortex_fsst::test_utils::make_fsst_emails; +use vortex_fsst::test_utils::make_fsst_file_paths; +use vortex_fsst::test_utils::make_fsst_json_strings; +use vortex_fsst::test_utils::make_fsst_log_lines; +use vortex_fsst::test_utils::make_fsst_rare_match; +use vortex_fsst::test_utils::make_fsst_short_urls; use vortex_session::VortexSession; +fn main() { + divan::main(); +} + static SESSION: LazyLock = LazyLock::new(|| VortexSession::empty().with::()); -macro_rules! 
vortex_like_bench { - ($name:ident, $make_fn:ident, $pattern:expr) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = $make_fn(N); - let len = fsst.len(); - let arr = fsst.into_array(); - let pattern = ConstantArray::new($pattern, len).into_array(); - bencher.bench_local(|| { - Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()]) - .unwrap() - .into_array() - .execute::(&mut SESSION.create_execution_ctx()) - .unwrap() - }); - } - }; -} +const N: usize = NUM_STRINGS; + +static FSST_URLS: LazyLock = LazyLock::new(|| make_fsst_short_urls(N)); +static FSST_CB_URLS: LazyLock = LazyLock::new(|| make_fsst_clickbench_urls(N)); +static FSST_LOG_LINES: LazyLock = LazyLock::new(|| make_fsst_log_lines(N)); +static FSST_JSON_STRINGS: LazyLock = LazyLock::new(|| make_fsst_json_strings(N)); +static FSST_FILE_PATHS: LazyLock = LazyLock::new(|| make_fsst_file_paths(N)); +static FSST_EMAILS: LazyLock = LazyLock::new(|| make_fsst_emails(N)); +static FSST_RARE_MATCH: LazyLock = LazyLock::new(|| make_fsst_rare_match(N)); -vortex_like_bench!(vortex_like_urls, make_fsst_urls, "%google%"); -vortex_like_bench!(vortex_like_cb, make_fsst_clickbench_urls, "%yandex%"); -vortex_like_bench!(vortex_like_log, make_fsst_log_lines, "%Googlebot%"); -vortex_like_bench!(vortex_like_json, make_fsst_json_strings, "%enterprise%"); -vortex_like_bench!(vortex_like_path, make_fsst_file_paths, "%target/release%"); -vortex_like_bench!(vortex_like_email, make_fsst_emails, "%gmail%"); -vortex_like_bench!(vortex_like_rare, make_fsst_rare_match, "%xyzzy%"); +enum Dataset { + Urls, + Cb, + Log, + Json, + Path, + Email, + Rare, +} -// Arrow LIKE benchmarks: decompress FSST → canonical, then run Arrow's LIKE -// (which uses memchr::memmem for %needle% patterns). -macro_rules! 
arrow_like_bench { - ($name:ident, $make_fn:ident, $pattern:expr) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = $make_fn(N); - let len = fsst.len(); - // Pre-decompress to canonical (VarBinViewArray) - let canonical = fsst - .into_array() - .execute::(&mut SESSION.create_execution_ctx()) - .unwrap() - .into_array(); - let pattern = ConstantArray::new($pattern, len).into_array(); - bencher.bench_local(|| { - Like.try_new_array( - len, - LikeOptions::default(), - [canonical.clone(), pattern.clone()], - ) - .unwrap() - .into_array() - .execute::(&mut SESSION.create_execution_ctx()) - .unwrap() - }); +impl fmt::Display for Dataset { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Urls => f.write_str("urls"), + Self::Cb => f.write_str("cb"), + Self::Log => f.write_str("log"), + Self::Json => f.write_str("json"), + Self::Path => f.write_str("path"), + Self::Email => f.write_str("email"), + Self::Rare => f.write_str("rare"), } - }; + } } -arrow_like_bench!(arrow_like_urls, make_fsst_urls, "%google%"); -arrow_like_bench!(arrow_like_cb, make_fsst_clickbench_urls, "%yandex%"); -arrow_like_bench!(arrow_like_log, make_fsst_log_lines, "%Googlebot%"); -arrow_like_bench!(arrow_like_json, make_fsst_json_strings, "%enterprise%"); -arrow_like_bench!(arrow_like_rare, make_fsst_rare_match, "%xyzzy%"); +impl Dataset { + fn fsst_array(&self) -> &'static FSSTArray { + match self { + Self::Urls => &FSST_URLS, + Self::Cb => &FSST_CB_URLS, + Self::Log => &FSST_LOG_LINES, + Self::Json => &FSST_JSON_STRINGS, + Self::Path => &FSST_FILE_PATHS, + Self::Email => &FSST_EMAILS, + Self::Rare => &FSST_RARE_MATCH, + } + } -// End-to-end: decompress + arrow LIKE (measures total cost including decompression) -macro_rules! 
e2e_arrow_like_bench { - ($name:ident, $make_fn:ident, $pattern:expr) => { - #[divan::bench] - fn $name(bencher: Bencher) { - let fsst = $make_fn(N); - let len = fsst.len(); - let arr = fsst.into_array(); - let pattern = ConstantArray::new($pattern, len).into_array(); - bencher.bench_local(|| { - // Decompress inside the timed section - let canonical = arr - .clone() - .execute::(&mut SESSION.create_execution_ctx()) - .unwrap() - .into_array(); - Like.try_new_array(len, LikeOptions::default(), [canonical, pattern.clone()]) - .unwrap() - .into_array() - .execute::(&mut SESSION.create_execution_ctx()) - .unwrap() - }); + fn pattern(&self) -> &'static str { + match self { + Self::Urls => "%google%", + Self::Cb => "%yandex%", + Self::Log => "%Googlebot%", + Self::Json => "%enterprise%", + Self::Path => "%target/release%", + Self::Email => "%gmail%", + Self::Rare => "%xyzzy%", } - }; + } } -e2e_arrow_like_bench!(e2e_arrow_urls, make_fsst_urls, "%google%"); -e2e_arrow_like_bench!(e2e_arrow_cb, make_fsst_clickbench_urls, "%yandex%"); -e2e_arrow_like_bench!(e2e_arrow_log, make_fsst_log_lines, "%Googlebot%"); -e2e_arrow_like_bench!(e2e_arrow_json, make_fsst_json_strings, "%enterprise%"); -e2e_arrow_like_bench!(e2e_arrow_rare, make_fsst_rare_match, "%xyzzy%"); +#[divan::bench(args = [ + Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json, + Dataset::Path, Dataset::Email, Dataset::Rare, +])] +fn fsst_like(bencher: Bencher, dataset: &Dataset) { + let fsst = dataset.fsst_array(); + let len = fsst.len(); + let arr = fsst.clone().into_array(); + let pattern = ConstantArray::new(dataset.pattern(), len).into_array(); + bencher.bench_local(|| { + Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()]) + .unwrap() + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + }); +} diff --git a/encodings/fsst/benches/fsst_url_compare.rs b/encodings/fsst/benches/fsst_url_compare.rs index 57bcde80cc3..6dc3ddbe087 100644 --- 
a/encodings/fsst/benches/fsst_url_compare.rs +++ b/encodings/fsst/benches/fsst_url_compare.rs @@ -6,9 +6,6 @@ use std::sync::LazyLock; use divan::Bencher; -use rand::Rng; -use rand::SeedableRng; -use rand::rngs::StdRng; use vortex_array::IntoArray; use vortex_array::RecursiveCanonical; use vortex_array::VortexSessionExecute; @@ -16,8 +13,6 @@ use vortex_array::arrays::ConstantArray; use vortex_array::arrays::VarBinArray; use vortex_array::builtins::ArrayBuiltins; use vortex_array::compute::warm_up_vtables; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; use vortex_array::expr::like; use vortex_array::expr::lit; use vortex_array::expr::root; @@ -26,6 +21,10 @@ use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::session::ArraySession; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; +use vortex_fsst::test_utils::HIGH_MATCH_DOMAIN; +use vortex_fsst::test_utils::LOW_MATCH_DOMAIN; +use vortex_fsst::test_utils::NUM_STRINGS; +use vortex_fsst::test_utils::generate_url_data; use vortex_session::VortexSession; fn main() { @@ -36,76 +35,7 @@ fn main() { static SESSION: LazyLock = LazyLock::new(|| VortexSession::empty().with::()); -const NUM_URLS: usize = 100_000; - -/// A high-frequency domain that appears in ~50% of generated URLs. -const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru"; - -/// A low-frequency domain that appears in ~1% of generated URLs. -const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com"; - -// Domains modeled after real ClickBench URL distributions. 
-const DOMAINS: &[(&str, u32)] = &[ - ("smeshariki.ru", 500), // ~50% - ("auto.ru", 150), // ~15% - ("komme.ru", 100), // ~10% - ("yandex.ru", 80), // ~8% - ("mail.ru", 60), // ~6% - ("livejournal.com", 40), // ~4% - ("vk.com", 30), // ~3% - ("avito.ru", 20), // ~2% - ("kinopoisk.ru", 10), // ~1% - ("rare-example-domain.com", 10), // ~1% -]; - -const PATHS: &[&str] = &[ - "/GameMain.aspx", - "/index.php", - "/catalog/item", - "/search", - "/news/article", - "/user/profile", - "/collection/view", - "/cars/used/sale", - "/forum/thread", - "/photo/album", - "/video/watch", - "/download/file", - "/api/v1/resource", - "/shop/product", - "/blog/post", -]; - -/// Generate 100k realistic ClickBench-style URLs. -fn generate_url_data() -> VarBinArray { - let mut rng = StdRng::seed_from_u64(42); - - // Build a weighted domain lookup. - let total_weight: u32 = DOMAINS.iter().map(|(_, w)| w).sum(); - let urls: Vec>> = (0..NUM_URLS) - .map(|_| { - let domain_roll = rng.random_range(0..total_weight); - let mut cumulative = 0u32; - let mut domain = DOMAINS[0].0; - for &(d, w) in DOMAINS { - cumulative += w; - if domain_roll < cumulative { - domain = d; - break; - } - } - - let path = PATHS[rng.random_range(0..PATHS.len())]; - let query_id: u32 = rng.random_range(1..100_000); - let tab: u16 = rng.random_range(1..20); - - let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}"); - Some(url.into_bytes().into_boxed_slice()) - }) - .collect(); - - VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable)) -} +const NUM_URLS: usize = NUM_STRINGS; static URL_DATA: LazyLock = LazyLock::new(generate_url_data); diff --git a/encodings/fsst/src/test_utils.rs b/encodings/fsst/src/test_utils.rs index fcf0d331c5e..b078229b7c1 100644 --- a/encodings/fsst/src/test_utils.rs +++ b/encodings/fsst/src/test_utils.rs @@ -16,6 +16,7 @@ use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_error::VortexExpect; +use crate::FSSTArray; use 
crate::fsst_compress; use crate::fsst_train_compressor; @@ -59,3 +60,527 @@ pub fn gen_dict_fsst_test_data( DictArray::try_new(codes.into_array(), values) .vortex_expect("DictArray::try_new should succeed for test data") } + +// --------------------------------------------------------------------------- +// Benchmark dataset generators +// --------------------------------------------------------------------------- + +pub const NUM_STRINGS: usize = 100_000; + +// --------------------------------------------------------------------------- +// URL generator (ClickBench-style weighted domains) +// --------------------------------------------------------------------------- + +pub const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru"; +pub const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com"; + +pub const URL_DOMAINS: &[(&str, u32)] = &[ + ("smeshariki.ru", 500), + ("auto.ru", 150), + ("komme.ru", 100), + ("yandex.ru", 80), + ("mail.ru", 60), + ("livejournal.com", 40), + ("vk.com", 30), + ("avito.ru", 20), + ("kinopoisk.ru", 10), + ("rare-example-domain.com", 10), +]; + +pub const URL_PATHS: &[&str] = &[ + "/GameMain.aspx", + "/index.php", + "/catalog/item", + "/search", + "/news/article", + "/user/profile", + "/collection/view", + "/cars/used/sale", + "/forum/thread", + "/photo/album", + "/video/watch", + "/download/file", + "/api/v1/resource", + "/shop/product", + "/blog/post", +]; + +pub fn generate_url_data() -> VarBinArray { + generate_url_data_n(NUM_STRINGS) +} + +pub fn generate_url_data_n(n: usize) -> VarBinArray { + let mut rng = StdRng::seed_from_u64(42); + let total_weight: u32 = URL_DOMAINS.iter().map(|(_, w)| w).sum(); + let urls: Vec>> = (0..n) + .map(|_| { + let domain_roll = rng.random_range(0..total_weight); + let mut cumulative = 0u32; + let mut domain = URL_DOMAINS[0].0; + for &(d, w) in URL_DOMAINS { + cumulative += w; + if domain_roll < cumulative { + domain = d; + break; + } + } + let path = URL_PATHS[rng.random_range(0..URL_PATHS.len())]; + let 
query_id: u32 = rng.random_range(1..100_000); + let tab: u16 = rng.random_range(1..20); + let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}"); + Some(url.into_bytes().into_boxed_slice()) + }) + .collect(); + VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable)) +} + +pub fn make_fsst_urls(n: usize) -> FSSTArray { + let varbin = generate_url_data_n(n); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// ClickBench-style URL generator (longer URLs with query params, fragments) +// --------------------------------------------------------------------------- + +const CB_DOMAINS: &[&str] = &[ + "www.google.com", + "yandex.ru", + "mail.ru", + "vk.com", + "www.youtube.com", + "www.facebook.com", + "ok.ru", + "go.mail.ru", + "www.avito.ru", + "pogoda.yandex.ru", + "news.yandex.ru", + "maps.yandex.ru", + "market.yandex.ru", + "afisha.yandex.ru", + "auto.ru", + "www.kinopoisk.ru", + "www.ozon.ru", + "www.wildberries.ru", + "aliexpress.ru", + "lenta.ru", +]; + +const CB_PATHS: &[&str] = &[ + "/search", + "/catalog/electronics/smartphones", + "/product/item/123456789", + "/news/2024/03/15/article-about-technology", + "/user/profile/settings/notifications", + "/api/v2/catalog/search", + "/checkout/cart/summary", + "/blog/2024/how-to-optimize-database-queries-for-better-performance", + "/category/home-and-garden/furniture/tables", + "/", +]; + +const CB_PARAMS: &[&str] = &[ + "?utm_source=google&utm_medium=cpc&utm_campaign=spring_sale_2024&utm_content=banner_v2", + "?q=buy+smartphone+online+cheap+free+shipping&category=electronics&sort=price_asc&page=3", + "?ref=main_page_carousel_block_position_4&sessionid=abc123def456", + "?from=tabbar&clid=2270455&text=weather+forecast+tomorrow", + "?lr=213&msid=1234567890.12345&suggest_reqid=abcdef&csg=12345", + "", + "", + "", + "?page=1&per_page=20", + 
"?source=serp&forceshow=1", +]; + +const CB_FRAGMENTS: &[&str] = &[ + "", + "", + "", + "#section-reviews", + "#comments", + "#price-history", + "", + "", + "", + "", +]; + +pub fn generate_clickbench_urls(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(123); + (0..n) + .map(|_| { + let scheme = if rng.random_bool(0.7) { + "https" + } else { + "http" + }; + let domain = CB_DOMAINS[rng.random_range(0..CB_DOMAINS.len())]; + let path = CB_PATHS[rng.random_range(0..CB_PATHS.len())]; + let params = CB_PARAMS[rng.random_range(0..CB_PARAMS.len())]; + let fragment = CB_FRAGMENTS[rng.random_range(0..CB_FRAGMENTS.len())]; + format!("{scheme}://{domain}{path}{params}{fragment}") + }) + .collect() +} + +pub fn make_fsst_clickbench_urls(n: usize) -> FSSTArray { + let urls = generate_clickbench_urls(n); + let varbin = VarBinArray::from_iter( + urls.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Short URL generator (simple URLs for contains benchmarks) +// --------------------------------------------------------------------------- + +const SHORT_URL_DOMAINS: &[&str] = &[ + "google.com", + "facebook.com", + "github.com", + "stackoverflow.com", + "amazon.com", + "reddit.com", + "twitter.com", + "youtube.com", + "wikipedia.org", + "microsoft.com", + "apple.com", + "netflix.com", + "linkedin.com", + "cloudflare.com", + "google.co.uk", + "docs.google.com", + "mail.google.com", + "maps.google.com", + "news.ycombinator.com", + "arxiv.org", +]; + +const SHORT_URL_PATHS: &[&str] = &[ + "/index.html", + "/about", + "/search?q=vortex", + "/user/profile/settings", + "/api/v2/data", + "/blog/2024/post", + "/products/item/12345", + "/docs/reference/guide", + "/login", + "/dashboard/analytics", +]; + +pub fn generate_short_urls(n: usize) -> Vec { + let mut rng = 
StdRng::seed_from_u64(42); + (0..n) + .map(|_| { + let scheme = if rng.random_bool(0.8) { + "https" + } else { + "http" + }; + let domain = SHORT_URL_DOMAINS[rng.random_range(0..SHORT_URL_DOMAINS.len())]; + let path = SHORT_URL_PATHS[rng.random_range(0..SHORT_URL_PATHS.len())]; + format!("{scheme}://{domain}{path}") + }) + .collect() +} + +pub fn make_fsst_short_urls(n: usize) -> FSSTArray { + let urls = generate_short_urls(n); + let varbin = VarBinArray::from_iter( + urls.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Log lines generator (Apache/nginx-style access logs) +// --------------------------------------------------------------------------- + +const LOG_METHODS: &[&str] = &["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD"]; +const LOG_PATHS: &[&str] = &[ + "/api/v1/users", + "/api/v2/products/search", + "/healthcheck", + "/static/js/app.bundle.min.js", + "/favicon.ico", + "/login", + "/dashboard/analytics", + "/api/v1/orders/12345/status", + "/graphql", + "/metrics", +]; +const LOG_STATUS: &[u16] = &[ + 200, 200, 200, 200, 200, 201, 301, 302, 400, 403, 404, 500, 502, +]; +const LOG_IPS: &[&str] = &[ + "192.168.1.1", + "10.0.0.42", + "172.16.0.100", + "203.0.113.50", + "198.51.100.23", + "8.8.8.8", + "1.1.1.1", + "74.125.200.100", + "151.101.1.69", + "93.184.216.34", +]; +const LOG_UAS: &[&str] = &[ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)", + "curl/7.81.0", + "python-requests/2.28.1", + "Go-http-client/1.1", + "Googlebot/2.1 (+http://www.google.com/bot.html)", +]; + +pub fn generate_log_lines(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(456); + (0..n) + .map(|_| { + let ip = LOG_IPS[rng.random_range(0..LOG_IPS.len())]; + let method = 
LOG_METHODS[rng.random_range(0..LOG_METHODS.len())]; + let path = LOG_PATHS[rng.random_range(0..LOG_PATHS.len())]; + let status = LOG_STATUS[rng.random_range(0..LOG_STATUS.len())]; + let size = rng.random_range(100..50000); + let ua = LOG_UAS[rng.random_range(0..LOG_UAS.len())]; + format!( + r#"{ip} - - [15/Mar/2024:10:{:02}:{:02} +0000] "{method} {path} HTTP/1.1" {status} {size} "-" "{ua}""#, + rng.random_range(0..60u32), + rng.random_range(0..60u32), + ) + }) + .collect() +} + +pub fn make_fsst_log_lines(n: usize) -> FSSTArray { + let lines = generate_log_lines(n); + let varbin = VarBinArray::from_iter( + lines.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// JSON strings generator (typical API response payloads) +// --------------------------------------------------------------------------- + +const JSON_NAMES: &[&str] = &[ + "Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack", +]; +const JSON_CITIES: &[&str] = &[ + "New York", + "London", + "Tokyo", + "Berlin", + "Sydney", + "Toronto", + "Paris", + "Mumbai", + "São Paulo", + "Seoul", +]; +const JSON_TAGS: &[&str] = &[ + "premium", + "verified", + "admin", + "moderator", + "subscriber", + "trial", + "enterprise", + "developer", +]; + +pub fn generate_json_strings(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(789); + (0..n) + .map(|_| { + let name = JSON_NAMES[rng.random_range(0..JSON_NAMES.len())]; + let city = JSON_CITIES[rng.random_range(0..JSON_CITIES.len())]; + let age = rng.random_range(18..80u32); + let tag1 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; + let tag2 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; + let id = rng.random_range(10000..99999u32); + format!( + 
r#"{{"id":{id},"name":"{name}","age":{age},"city":"{city}","tags":["{tag1}","{tag2}"],"active":true}}"# + ) + }) + .collect() +} + +pub fn make_fsst_json_strings(n: usize) -> FSSTArray { + let jsons = generate_json_strings(n); + let varbin = VarBinArray::from_iter( + jsons.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// File paths generator (Unix-style paths with various depths) +// --------------------------------------------------------------------------- + +const PATH_ROOTS: &[&str] = &[ + "/home/user", + "/var/log", + "/etc", + "/usr/local/bin", + "/opt/app", + "/tmp", + "/srv/www", + "/data/warehouse", +]; +const PATH_DIRS: &[&str] = &[ + "src", + "build", + "dist", + "node_modules", + "target/release", + "config", + ".cache", + "logs/2024", + "backups/daily", + "migrations", +]; +const PATH_FILES: &[&str] = &[ + "main.rs", + "index.ts", + "config.yaml", + "Dockerfile", + "schema.sql", + "app.log", + "data.parquet", + "model.onnx", + "README.md", + "package.json", +]; + +pub fn generate_file_paths(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(321); + (0..n) + .map(|_| { + let root = PATH_ROOTS[rng.random_range(0..PATH_ROOTS.len())]; + let dir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; + let file = PATH_FILES[rng.random_range(0..PATH_FILES.len())]; + let depth = rng.random_range(0..3u32); + let mut path = format!("{root}/{dir}"); + for _ in 0..depth { + let subdir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; + path.push('/'); + path.push_str(subdir); + } + path.push('/'); + path.push_str(file); + path + }) + .collect() +} + +pub fn make_fsst_file_paths(n: usize) -> FSSTArray { + let paths = generate_file_paths(n); + let varbin = VarBinArray::from_iter( + paths.iter().map(|s| Some(s.as_str())), + 
DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Email addresses generator +// --------------------------------------------------------------------------- + +const EMAIL_USERS: &[&str] = &[ + "john.doe", + "jane.smith", + "admin", + "support", + "no-reply", + "sales.team", + "dev+test", + "marketing", + "info", + "contact.us", +]; +const EMAIL_DOMAINS: &[&str] = &[ + "gmail.com", + "yahoo.com", + "outlook.com", + "company.io", + "example.org", + "mail.ru", + "protonmail.com", + "fastmail.com", + "icloud.com", + "hey.com", +]; + +pub fn generate_emails(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(654); + (0..n) + .map(|_| { + let user = EMAIL_USERS[rng.random_range(0..EMAIL_USERS.len())]; + let domain = EMAIL_DOMAINS[rng.random_range(0..EMAIL_DOMAINS.len())]; + let suffix = rng.random_range(0..1000u32); + format!("{user}{suffix}@{domain}") + }) + .collect() +} + +pub fn make_fsst_emails(n: usize) -> FSSTArray { + let emails = generate_emails(n); + let varbin = VarBinArray::from_iter( + emails.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Rare match strings generator +// --------------------------------------------------------------------------- + +pub const RARE_NEEDLE: &[u8] = b"xyzzy"; + +pub fn generate_rare_match_strings(n: usize, match_rate: f64) -> Vec { + let mut rng = StdRng::seed_from_u64(999); + let charset: &[u8] = b"abcdefghijklmnopqrstuvwABCDEFGHIJKLMNOPQRSTUVW0123456789-_.:/"; + (0..n) + .map(|_| { + let len = rng.random_range(30..60); + let mut s: String = (0..len) + .map(|_| charset[rng.random_range(0..charset.len())] as char) + .collect(); + if 
rng.random_bool(match_rate) { + let pos = rng.random_range(0..s.len().saturating_sub(RARE_NEEDLE.len()) + 1); + s.replace_range( + pos..pos + RARE_NEEDLE.len().min(s.len() - pos), + std::str::from_utf8(RARE_NEEDLE).unwrap(), + ); + } + s + }) + .collect() +} + +pub fn make_fsst_rare_match(n: usize) -> FSSTArray { + let strings = generate_rare_match_strings(n, 0.00001); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +}