From dc7c5af932bdef45296cc4f1f6f7aed74c98dfda Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 12 Mar 2026 13:57:27 +0000 Subject: [PATCH 1/2] wip: fsst dedup Signed-off-by: Joe Isaacs --- Cargo.lock | 2 - Cargo.toml | 4 +- encodings/fsst/examples/fsst_symbol_table.rs | 59 ++ encodings/fsst/src/array.rs | 76 ++ vendor/fsst-rs/Cargo.toml | 55 + vendor/fsst-rs/LICENSE | 201 ++++ vendor/fsst-rs/src/builder.rs | 993 +++++++++++++++++++ vendor/fsst-rs/src/lib.rs | 970 ++++++++++++++++++ vendor/fsst-rs/src/lossy_pht.rs | 128 +++ 9 files changed, 2485 insertions(+), 3 deletions(-) create mode 100644 encodings/fsst/examples/fsst_symbol_table.rs create mode 100644 vendor/fsst-rs/Cargo.toml create mode 100644 vendor/fsst-rs/LICENSE create mode 100644 vendor/fsst-rs/src/builder.rs create mode 100644 vendor/fsst-rs/src/lib.rs create mode 100644 vendor/fsst-rs/src/lossy_pht.rs diff --git a/Cargo.lock b/Cargo.lock index d380a3d6229..6954cf3cdb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3701,8 +3701,6 @@ dependencies = [ [[package]] name = "fsst-rs" version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561f2458a3407836ab8f1acc9113b8cda91b9d6378ba8dad13b2fe1a1d3af5ce" [[package]] name = "fst" diff --git a/Cargo.toml b/Cargo.toml index 0da5ee805ba..d8d111f0822 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,8 @@ members = [ "encodings/zigzag", "encodings/zstd", "encodings/bytebool", + # Vendored dependencies + "vendor/fsst-rs", # Benchmarks "benchmarks/lance-bench", "benchmarks/compress-bench", @@ -142,7 +144,7 @@ enum-iterator = "2.0.0" env_logger = "0.11" fastlanes = "0.5" flatbuffers = "25.2.10" -fsst-rs = "0.5.5" +fsst-rs = { path = "vendor/fsst-rs" } futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" get_dir = "0.5.0" diff --git a/encodings/fsst/examples/fsst_symbol_table.rs b/encodings/fsst/examples/fsst_symbol_table.rs new file mode 100644 index 00000000000..c1cad6e3b10 --- /dev/null +++ 
b/encodings/fsst/examples/fsst_symbol_table.rs @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Reads lines from stdin, trains an FSST symbol table, and prints it. +//! +//! Usage: +//! cat urls.txt | cargo run -p vortex-fsst --example fsst_symbol_table +//! duckdb -csv -noheader -c "SELECT URL FROM 'hits_0.parquet' LIMIT 100000" | cargo run ... + +#![allow(clippy::expect_used)] + +use std::io; +use std::io::BufRead; + +use vortex_array::arrays::VarBinArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; + +fn main() { + let stdin = io::stdin(); + let lines: Vec<Option<Box<[u8]>>> = stdin + .lock() + .lines() + .map(|l| { + l.expect("failed to read line") + .into_bytes() + .into_boxed_slice() + }) + .map(Some) + .collect(); + + let n = lines.len(); + eprintln!("Read {n} lines from stdin"); + + let varbin = VarBinArray::from_iter(lines, DType::Utf8(Nullability::NonNullable)); + let compressor = fsst_train_compressor(&varbin); + let fsst_array = fsst_compress(&varbin, &compressor); + + print!("{}", fsst_array.format_symbol_table()); + + // Report duplicate symbols in the table. 
+ let symbols = compressor.symbol_table(); + let lengths = compressor.symbol_lengths(); + let total = symbols.len(); + let mut keys: Vec<(u64, u8)> = symbols + .iter() + .zip(lengths.iter()) + .map(|(sym, &len)| (sym.to_u64(), len)) + .collect(); + keys.sort(); + let unique_count = { + keys.dedup(); + keys.len() + }; + let duplicates = total - unique_count; + eprintln!("Symbol table: {total} symbols, {duplicates} duplicates"); +} diff --git a/encodings/fsst/src/array.rs b/encodings/fsst/src/array.rs index 2f10b0b1a47..4ee89b73ee8 100644 --- a/encodings/fsst/src/array.rs +++ b/encodings/fsst/src/array.rs @@ -3,6 +3,7 @@ use std::fmt::Debug; use std::fmt::Formatter; +use std::fmt::Write; use std::hash::Hash; use std::sync::Arc; use std::sync::LazyLock; @@ -514,6 +515,46 @@ impl FSSTArray { pub fn compressor(&self) -> &Compressor { self.compressor.as_ref() } + + /// Format the FSST symbol table as a human-readable table. + /// + /// Each row shows the symbol code, byte length, hex bytes, and a text representation + /// where printable ASCII is shown as-is and other bytes are escaped. 
+ pub fn format_symbol_table(&self) -> String { + fn escape_byte(b: u8) -> String { + match b { + b'\n' => "\\n".to_string(), + b'\t' => "\\t".to_string(), + b'\r' => "\\r".to_string(), + b if b.is_ascii_graphic() || b == b' ' => (b as char).to_string(), + b => format!("\\x{b:02x}"), + } + } + + let symbols = self.symbols.as_slice(); + let lengths = self.symbol_lengths.as_slice(); + let n = symbols.len(); + + let mut out = format!("FSST Symbol Table ({n} symbols)\n"); + let _ = writeln!(out, "{:>4} | {:>3} | {:<23} | Text", "Code", "Len", "Hex"); + let _ = writeln!(out, "-----+-----+-------------------------+------"); + + for (code, (sym, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + let bytes = sym.to_u64().to_le_bytes(); + let actual = &bytes[..len as usize]; + + let hex: String = actual + .iter() + .map(|b| format!("{b:02x}")) + .collect::>() + .join(" "); + + let text: String = actual.iter().map(|&b| escape_byte(b)).collect(); + + let _ = writeln!(out, "{code:>4} | {len:>3} | {hex:<23} | {text}"); + } + out + } } impl ValidityChild for FSSTVTable { @@ -544,7 +585,9 @@ mod test { use crate::FSSTVTable; use crate::array::FSSTMetadata; + use crate::fsst_compress; use crate::fsst_compress_iter; + use crate::fsst_train_compressor; #[cfg_attr(miri, ignore)] #[test] @@ -628,4 +671,37 @@ mod test { }) .unwrap() } + + #[test] + fn test_format_symbol_table_urls() { + use vortex_array::arrays::VarBinArray; + + // Clickbench-style URL data + let urls: Vec> = vec![ + Some(b"http://smeshariki.ru/GameMain.aspx?id=123&tab=1#ref=123"), + Some(b"http://smeshariki.ru/index.php?id=456&tab=2#ref=456"), + Some(b"http://auto.ru/cars/used/sale?id=789&tab=3#ref=789"), + Some(b"http://komme.ru/search?id=1000&tab=4#ref=1000"), + Some(b"http://yandex.ru/news/article?id=2000&tab=5#ref=2000"), + Some(b"http://mail.ru/user/profile?id=3000&tab=6#ref=3000"), + Some(b"http://smeshariki.ru/catalog/item?id=4000&tab=7#ref=4000"), + 
Some(b"http://auto.ru/forum/thread?id=5000&tab=8#ref=5000"), + Some(b"http://vk.com/photo/album?id=6000&tab=9#ref=6000"), + Some(b"http://livejournal.com/blog/post?id=7000&tab=10#ref=7000"), + ]; + + let varbin = VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable)); + let compressor = fsst_train_compressor(&varbin); + let fsst_array = fsst_compress(&varbin, &compressor); + + let table = fsst_array.format_symbol_table(); + eprintln!("{table}"); + + // Verify basic structure + assert!(table.starts_with("FSST Symbol Table (")); + assert!(table.contains("Code")); + assert!(table.contains("Len")); + assert!(table.contains("Hex")); + assert!(table.contains("Text")); + } } diff --git a/vendor/fsst-rs/Cargo.toml b/vendor/fsst-rs/Cargo.toml new file mode 100644 index 00000000000..666a2166c7a --- /dev/null +++ b/vendor/fsst-rs/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "fsst-rs" +version = "0.5.6" +edition = "2024" +rust-version = "1.86.0" +authors = ["SpiralDB Developers "] +description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression" +license = "Apache-2.0" +repository = "https://github.com/spiraldb/fsst" +readme = "README.md" +keywords = ["compression", "fsst"] +categories = ["compression"] + +[lib] +name = "fsst" +path = "src/lib.rs" + +[lints.clippy] +or_fun_call = "deny" + +[lints.clippy.all] +level = "deny" +priority = -1 + +[lints.clippy.if_then_some_else_none] +level = "deny" +priority = 0 + +[lints.clippy.mem_forget] +level = "deny" +priority = 0 + +[lints.clippy.panic_in_result_fn] +level = "deny" +priority = 0 + +[lints.clippy.same_name_method] +level = "deny" +priority = 0 + +[lints.clippy.tests_outside_test_module] +level = "deny" +priority = 0 + +[lints.clippy.unwrap_in_result] +level = "deny" +priority = 0 + +[lints.clippy.use_debug] +level = "deny" +priority = 0 + +[lints.rust] +missing_docs = "deny" +warnings = "deny" diff --git a/vendor/fsst-rs/LICENSE b/vendor/fsst-rs/LICENSE new file mode 
100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/vendor/fsst-rs/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/fsst-rs/src/builder.rs b/vendor/fsst-rs/src/builder.rs new file mode 100644 index 00000000000..f2b911c8b5b --- /dev/null +++ b/vendor/fsst-rs/src/builder.rs @@ -0,0 +1,993 @@ +//! Functions and types used for building a [`Compressor`] from a corpus of text. +//! +//! This module implements the logic from Algorithm 3 of the [FSST Paper]. +//! +//! 
[FSST Paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use crate::Code; +use crate::Compressor; +use crate::FSST_CODE_BASE; +use crate::FSST_CODE_MASK; +use crate::Symbol; +use crate::advance_8byte_word; +use crate::compare_masked; +use crate::lossy_pht::LossyPHT; + +/// Bitmap that only works for values up to 512 +#[derive(Clone, Copy, Debug, Default)] +struct CodesBitmap { + codes: [u64; 8], +} + +assert_sizeof!(CodesBitmap => 64); + +impl CodesBitmap { + /// Set the indicated bit. Must be between 0 and [`FSST_CODE_MASK`][crate::FSST_CODE_MASK]. + pub(crate) fn set(&mut self, index: usize) { + debug_assert!( + index <= FSST_CODE_MASK as usize, + "code cannot exceed {FSST_CODE_MASK}" + ); + + let map = index >> 6; + self.codes[map] |= 1 << (index % 64); + } + + /// Check if `index` is present in the bitmap + pub(crate) fn is_set(&self, index: usize) -> bool { + debug_assert!( + index <= FSST_CODE_MASK as usize, + "code cannot exceed {FSST_CODE_MASK}" + ); + + let map = index >> 6; + self.codes[map] & (1 << (index % 64)) != 0 + } + + /// Get all codes set in this bitmap + pub(crate) fn codes(&self) -> CodesIterator<'_> { + CodesIterator { + inner: self, + index: 0, + block: self.codes[0], + reference: 0, + } + } + + /// Clear the bitmap of all entries. 
+ pub(crate) fn clear(&mut self) { + self.codes[0] = 0; + self.codes[1] = 0; + self.codes[2] = 0; + self.codes[3] = 0; + self.codes[4] = 0; + self.codes[5] = 0; + self.codes[6] = 0; + self.codes[7] = 0; + } +} + +struct CodesIterator<'a> { + inner: &'a CodesBitmap, + index: usize, + block: u64, + reference: usize, +} + +impl Iterator for CodesIterator<'_> { + type Item = u16; + + fn next(&mut self) -> Option { + // If current is zero, advance to next non-zero block + while self.block == 0 { + self.index += 1; + if self.index >= 8 { + return None; + } + self.block = self.inner.codes[self.index]; + self.reference = self.index * 64; + } + + // Find the next set bit in the current block. + let position = self.block.trailing_zeros() as usize; + let code = self.reference + position; + + if code >= 511 { + return None; + } + + // The next iteration will calculate with reference to the returned code + 1 + self.reference = code + 1; + self.block = if position == 63 { + 0 + } else { + self.block >> (1 + position) + }; + + Some(code as u16) + } +} + +#[derive(Debug, Clone)] +struct Counter { + /// Frequency count for each code. + counts1: Vec, + + /// Frequency count for each code-pair. + counts2: Vec, + + /// Bitmap index for codes that appear in counts1 + code1_index: CodesBitmap, + + /// Bitmap index of pairs that have been set. + /// + /// `pair_index[code1].codes()` yields an iterator that can + /// be used to find all possible codes that follow `codes1`. + pair_index: Vec, +} + +const COUNTS1_SIZE: usize = (FSST_CODE_MASK + 1) as usize; + +// NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector, +// because `vec!` has a specialization for zero. +// +// We also include +1 extra row at the end so that we can do writes into the counters without a branch +// for the first iteration. 
+const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE; + +impl Counter { + fn new() -> Self { + let mut counts1 = Vec::with_capacity(COUNTS1_SIZE); + let mut counts2 = Vec::with_capacity(COUNTS2_SIZE); + // SAFETY: all accesses to the vector go through the bitmap to ensure no uninitialized + // data is ever read from these vectors. + unsafe { + counts1.set_len(COUNTS1_SIZE); + counts2.set_len(COUNTS2_SIZE); + } + + Self { + counts1, + counts2, + code1_index: CodesBitmap::default(), + pair_index: vec![CodesBitmap::default(); COUNTS1_SIZE], + } + } + + #[inline] + fn record_count1(&mut self, code1: u16) { + // If not set, we want to start at one. + let base = if self.code1_index.is_set(code1 as usize) { + self.counts1[code1 as usize] + } else { + 0 + }; + + self.counts1[code1 as usize] = base + 1; + self.code1_index.set(code1 as usize); + } + + #[inline] + fn record_count2(&mut self, code1: u16, code2: u16) { + debug_assert!(code1 == FSST_CODE_MASK || self.code1_index.is_set(code1 as usize)); + debug_assert!(self.code1_index.is_set(code2 as usize)); + + let idx = (code1 as usize) * COUNTS1_SIZE + (code2 as usize); + if self.pair_index[code1 as usize].is_set(code2 as usize) { + self.counts2[idx] += 1; + } else { + self.counts2[idx] = 1; + } + self.pair_index[code1 as usize].set(code2 as usize); + } + + #[inline] + fn count1(&self, code1: u16) -> usize { + debug_assert!(self.code1_index.is_set(code1 as usize)); + + self.counts1[code1 as usize] + } + + #[inline] + fn count2(&self, code1: u16, code2: u16) -> usize { + debug_assert!(self.code1_index.is_set(code1 as usize)); + debug_assert!(self.code1_index.is_set(code2 as usize)); + debug_assert!(self.pair_index[code1 as usize].is_set(code2 as usize)); + + let idx = (code1 as usize) * 512 + (code2 as usize); + self.counts2[idx] + } + + /// Returns an ordered iterator over the codes that were observed + /// in a call to [`Self::count1`]. 
+ fn first_codes(&self) -> CodesIterator<'_> { + self.code1_index.codes() + } + + /// Returns an iterator over the codes that have been observed + /// to follow `code1`. + /// + /// This is the set of all values `code2` where there was + /// previously a call to `self.record_count2(code1, code2)`. + fn second_codes(&self, code1: u16) -> CodesIterator<'_> { + self.pair_index[code1 as usize].codes() + } + + /// Clear the counters. + /// Note that this just touches the bitmaps and sets them all to invalid. + fn clear(&mut self) { + self.code1_index.clear(); + for index in &mut self.pair_index { + index.clear(); + } + } +} + +/// Entrypoint for building a new `Compressor`. +pub struct CompressorBuilder { + /// Table mapping codes to symbols. + /// + /// The entries 0-255 are setup in some other way here + symbols: Vec, + + /// The number of entries in the symbol table that have been populated, not counting + /// the escape values. + n_symbols: u8, + + /// Counts for number of symbols of each length. + /// + /// `len_histogram[len-1]` = count of the symbols of length `len`. + len_histogram: [u8; 8], + + /// Inverted index mapping 1-byte symbols to codes. + /// + /// This is only used for building, not used by the final `Compressor`. + codes_one_byte: Vec, + + /// Inverted index mapping 2-byte symbols to codes + codes_two_byte: Vec, + + /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more + lossy_pht: LossyPHT, +} + +impl CompressorBuilder { + /// Create a new builder. + pub fn new() -> Self { + // NOTE: `vec!` has a specialization for building a new vector of `0u64`. Because Symbol and u64 + // have the same bit pattern, we can allocate as u64 and transmute. If we do `vec![Symbol::EMPTY; N]`, + // that will create a new Vec and call `Symbol::EMPTY.clone()` `N` times which is considerably slower. + let symbols = vec![0u64; 511]; + + // SAFETY: transmute safety assured by the compiler. 
+ let symbols: Vec = unsafe { std::mem::transmute(symbols) }; + + let mut table = Self { + symbols, + n_symbols: 0, + len_histogram: [0; 8], + codes_two_byte: Vec::with_capacity(65_536), + codes_one_byte: Vec::with_capacity(512), + lossy_pht: LossyPHT::new(), + }; + + // Populate the escape byte entries. + for byte in 0..=255 { + let symbol = Symbol::from_u8(byte); + table.symbols[byte as usize] = symbol; + } + + // Fill codes_one_byte with pseudocodes for each byte. + for byte in 0..=255 { + // Push pseudocode for single-byte escape. + table.codes_one_byte.push(Code::new_escape(byte)); + } + + // Fill codes_two_byte with pseudocode of first byte + for idx in 0..=65_535 { + table.codes_two_byte.push(Code::new_escape(idx as u8)); + } + + table + } +} + +impl Default for CompressorBuilder { + fn default() -> Self { + Self::new() + } +} + +impl CompressorBuilder { + /// Attempt to insert a new symbol at the end of the table. + /// + /// # Panics + /// + /// Panics if the table is already full. + /// + /// # Returns + /// + /// Returns true if the symbol was inserted successfully, or false if it conflicted + /// with an existing symbol. + pub fn insert(&mut self, symbol: Symbol, len: usize) -> bool { + assert!(self.n_symbols < 255, "cannot insert into full symbol table"); + assert_eq!(len, symbol.len(), "provided len must equal symbol.len()"); + + if len == 2 { + // Check if this 2-byte symbol is already in the table. + if self.codes_two_byte[symbol.first2() as usize].extended_code() >= FSST_CODE_BASE { + return false; + } + self.codes_two_byte[symbol.first2() as usize] = + Code::new_symbol_building(self.n_symbols, 2); + } else if len == 1 { + // Check if this 1-byte symbol is already in the table. 
+ if self.codes_one_byte[symbol.first_byte() as usize].extended_code() >= FSST_CODE_BASE { + return false; + } + self.codes_one_byte[symbol.first_byte() as usize] = + Code::new_symbol_building(self.n_symbols, 1); + } else { + // Symbols of 3 or more bytes go into the hash table + if !self.lossy_pht.insert(symbol, len, self.n_symbols) { + return false; + } + } + + // Increment length histogram. + self.len_histogram[len - 1] += 1; + + // Insert successfully stored symbol at end of the symbol table + // Note the rescaling from range [0-254] -> [256, 510]. + self.symbols[256 + (self.n_symbols as usize)] = symbol; + self.n_symbols += 1; + true + } + + /// Clear all set items from the compressor. + /// + /// This is considerably faster than building a new Compressor from scratch for each + /// iteration of the `train` loop. + fn clear(&mut self) { + // Eliminate every observed code from the table. + for code in 0..(256 + self.n_symbols as usize) { + let symbol = self.symbols[code]; + if symbol.len() == 1 { + // Reset the entry from the codes_one_byte array. + self.codes_one_byte[symbol.first_byte() as usize] = + Code::new_escape(symbol.first_byte()); + } else if symbol.len() == 2 { + // Reset the entry from the codes_two_byte array. + self.codes_two_byte[symbol.first2() as usize] = + Code::new_escape(symbol.first_byte()); + } else { + // Clear the hashtable entry + self.lossy_pht.remove(symbol); + } + } + + // Reset len histogram + for i in 0..=7 { + self.len_histogram[i] = 0; + } + + self.n_symbols = 0; + } + + /// Finalizing the table is done once building is complete to prepare for efficient + /// compression. + /// + /// When we finalize the table, the following modifications are made in-place: + /// + /// 1. The codes are renumbered so that all symbols are ordered by length (order 23456781). + /// During this process, the two byte symbols are separated into a byte_lim and a suffix_lim, + /// so we know that we don't need to check the suffix limitations instead. 
+ /// 2. The 1-byte symbols index is merged into the 2-byte symbols index to allow for use of only + /// a single index in front of the hash table. + /// + /// # Returns + /// + /// Returns the `suffix_lim`, which is the index of the two-byte code before where we know + /// there are no longer suffixes in the symbol table. + /// + /// Also returns the lengths vector, which is of length `n_symbols` and contains the + /// length for each of the values. + fn finalize(&mut self) -> (u8, Vec<u8>) { + // Create a cumulative sum of each of the elements of the input line numbers. + // Do a map that includes the previously seen value as well. + // Regroup symbols based on their lengths. + // Space at the end of the symbol table reserved for the one-byte codes. + let byte_lim = self.n_symbols - self.len_histogram[0]; + + // Start code for each length. + // Length 1: at the end of symbol table. + // Length 2: starts at 0. Split into before/after suffixLim. + let mut codes_by_length = [0u8; 8]; + codes_by_length[0] = byte_lim; + codes_by_length[1] = 0; + + // codes for lengths 3..=8 start where the previous ones end. + for i in 1..7 { + codes_by_length[i + 1] = codes_by_length[i] + self.len_histogram[i]; + } + + // no_suffix_code is the lowest code for a symbol that does not have a longer 3+ byte + // suffix in the table. + // This value starts at 0 and extends up. + let mut no_suffix_code = 0; + + // The codes that do have a suffix begin just before the range of the 3-byte codes. 
+ let mut has_suffix_code = codes_by_length[2]; + + // Assign each symbol a new code ordered by lengths, in the order + // 2(no suffix) | 2 (suffix) | 3 | 4 | 5 | 6 | 7 | 8 | 1 + let mut new_codes = [0u8; FSST_CODE_BASE as usize]; + + let mut symbol_lens = [0u8; FSST_CODE_BASE as usize]; + + for i in 0..(self.n_symbols as usize) { + let symbol = self.symbols[256 + i]; + let len = symbol.len(); + if len == 2 { + let has_suffix = self + .symbols + .iter() + .skip(FSST_CODE_BASE as usize) + .enumerate() + .any(|(k, other)| i != k && symbol.first2() == other.first2()); + + if has_suffix { + // Symbols that have a longer suffix are inserted at the end of the 2-byte range + has_suffix_code -= 1; + new_codes[i] = has_suffix_code; + } else { + // Symbols that do not have a longer suffix are inserted at the start of + // the 2-byte range. + new_codes[i] = no_suffix_code; + no_suffix_code += 1; + } + } else { + // Assign new code based on the next code available for the given length symbol + new_codes[i] = codes_by_length[len - 1]; + codes_by_length[len - 1] += 1; + } + + // Write the symbol into the front half of the symbol table. + // We are reusing the space that was previously occupied by escapes. + self.symbols[new_codes[i] as usize] = symbol; + symbol_lens[new_codes[i] as usize] = len as u8; + } + + // Truncate the symbol table to only include the "true" symbols. + self.symbols.truncate(self.n_symbols as usize); + + // Rewrite the codes_one_byte table to point at the new code values. + // Replace pseudocodes with escapes. + for byte in 0..=255 { + let one_byte = self.codes_one_byte[byte]; + if one_byte.extended_code() >= FSST_CODE_BASE { + let new_code = new_codes[one_byte.code() as usize]; + self.codes_one_byte[byte] = Code::new_symbol(new_code, 1); + } else { + // After finalize: codes_one_byte contains the unused value + self.codes_one_byte[byte] = Code::UNUSED; + } + } + + // Rewrite the codes_two_byte table to point at the new code values. 
+        // Replace pseudocodes with escapes.
+        for two_bytes in 0..=65_535 {
+            let two_byte = self.codes_two_byte[two_bytes];
+            if two_byte.extended_code() >= FSST_CODE_BASE {
+                let new_code = new_codes[two_byte.code() as usize];
+                self.codes_two_byte[two_bytes] = Code::new_symbol(new_code, 2);
+            } else {
+                // The one-byte code for the given code number here...
+                self.codes_two_byte[two_bytes] = self.codes_one_byte[two_bytes & 0xFF];
+            }
+        }
+
+        // Reset values in the hash table as well.
+        self.lossy_pht.renumber(&new_codes);
+
+        // Pre-compute the lengths
+        let mut lengths = Vec::with_capacity(self.n_symbols as usize);
+        for symbol in &self.symbols {
+            lengths.push(symbol.len() as u8);
+        }
+
+        (has_suffix_code, lengths)
+    }
+
+    /// Build into the final hash table.
+    pub fn build(mut self) -> Compressor {
+        // finalize the symbol table by inserting the codes_twobyte values into
+        // the relevant parts of the `codes_onebyte` set.
+
+        let (has_suffix_code, lengths) = self.finalize();
+
+        Compressor {
+            symbols: self.symbols,
+            lengths,
+            n_symbols: self.n_symbols,
+            has_suffix_code,
+            codes_two_byte: self.codes_two_byte,
+            lossy_pht: self.lossy_pht,
+        }
+    }
+}
+
+/// The number of generations used for training. This is taken from the [FSST paper].
+///
+/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
+#[cfg(not(miri))]
+const GENERATIONS: [usize; 5] = [8usize, 38, 68, 98, 128];
+#[cfg(miri)]
+const GENERATIONS: [usize; 3] = [8usize, 38, 128];
+
+const FSST_SAMPLETARGET: usize = 1 << 14;
+const FSST_SAMPLEMAX: usize = 1 << 15;
+const FSST_SAMPLELINE: usize = 512;
+
+/// Create a sample from a set of strings in the input.
+///
+/// Sample is constructed by copying "chunks" from the `str_in`s into the `sample_buf`; the
+/// returned slices are pointers into the `sample_buf`.
+///
+/// SAFETY: sample_buf must be >= FSST_SAMPLEMAX bytes long. Providing something less may cause unexpected failures.
+#[allow(clippy::ptr_arg)] +fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> { + assert!( + sample_buf.capacity() >= FSST_SAMPLEMAX, + "sample_buf.len() < FSST_SAMPLEMAX" + ); + + let mut sample: Vec<&[u8]> = Vec::new(); + + let tot_size: usize = str_in.iter().map(|s| s.len()).sum(); + if tot_size < FSST_SAMPLETARGET { + return str_in.clone(); + } + + let mut sample_rnd = fsst_hash(4637947); + let sample_lim = FSST_SAMPLETARGET; + let mut sample_buf_offset: usize = 0; + + while sample_buf_offset < sample_lim { + sample_rnd = fsst_hash(sample_rnd); + let line_nr = (sample_rnd as usize) % str_in.len(); + + // Find the first non-empty chunk starting at line_nr, wrapping around if + // necessary. + let Some(line) = (line_nr..str_in.len()) + .chain(0..line_nr) + .map(|line_nr| str_in[line_nr]) + .find(|line| !line.is_empty()) + else { + return sample; + }; + + let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE); + sample_rnd = fsst_hash(sample_rnd); + let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks); + + let len = FSST_SAMPLELINE.min(line.len() - chunk); + + sample_buf.extend_from_slice(&line[chunk..chunk + len]); + + // SAFETY: this is the data we just placed into `sample_buf` in the line above. + let slice = + unsafe { std::slice::from_raw_parts(sample_buf.as_ptr().add(sample_buf_offset), len) }; + + sample.push(slice); + + sample_buf_offset += len; + } + + sample +} + +/// Hash function used in various components of the library. +/// +/// This is equivalent to the FSST_HASH macro from the C++ implementation. +#[inline] +pub(crate) fn fsst_hash(value: u64) -> u64 { + value.wrapping_mul(2971215073) ^ value.wrapping_shr(15) +} + +impl Compressor { + /// Build and train a `Compressor` from a sample corpus of text. + /// + /// This function implements the generational algorithm described in the [FSST paper] Section + /// 4.3. 
Starting with an empty symbol table, it iteratively compresses the corpus, then attempts + /// to merge symbols when doing so would yield better compression than leaving them unmerged. The + /// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape + /// code). + /// + /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + pub fn train(values: &Vec<&[u8]>) -> Self { + let mut builder = CompressorBuilder::new(); + + if values.is_empty() { + return builder.build(); + } + + let mut counters = Counter::new(); + let mut sample_memory = Vec::with_capacity(FSST_SAMPLEMAX); + let mut pqueue = BinaryHeap::with_capacity(65_536); + + let sample = make_sample(&mut sample_memory, values); + for sample_frac in GENERATIONS { + for (i, line) in sample.iter().enumerate() { + if sample_frac < 128 && ((fsst_hash(i as u64) & 127) as usize) > sample_frac { + continue; + } + + builder.compress_count(line, &mut counters); + } + + // Clear the heap before we use it again + pqueue.clear(); + builder.optimize(&counters, sample_frac, &mut pqueue); + counters.clear(); + } + + builder.build() + } +} + +impl CompressorBuilder { + /// Find the longest symbol using the hash table and the codes_one_byte and codes_two_byte indexes. + fn find_longest_symbol(&self, word: u64) -> Code { + // Probe the hash table first to see if we have a long match + let entry = self.lossy_pht.lookup(word); + let ignored_bits = entry.ignored_bits; + + // If the entry is valid, return the code + if !entry.is_unused() && compare_masked(word, entry.symbol.to_u64(), ignored_bits) { + return entry.code; + } + + // Try and match first two bytes + let twobyte = self.codes_two_byte[word as u16 as usize]; + if twobyte.extended_code() >= FSST_CODE_BASE { + return twobyte; + } + + // Fall back to single-byte match + self.codes_one_byte[word as u8 as usize] + } + + /// Compress the text using the current symbol table. 
Count the code occurrences
+    /// and code-pair occurrences, calculating total gain using the current compressor.
+    ///
+    /// NOTE: this is largely an unfortunate amount of copy-paste from `compress`, just to make sure
+    /// we can do all the counting in a single pass.
+    fn compress_count(&self, sample: &[u8], counter: &mut Counter) -> usize {
+        let mut gain = 0;
+        if sample.is_empty() {
+            return gain;
+        }
+
+        let mut in_ptr = sample.as_ptr();
+
+        // SAFETY: `end` will point just after the end of the `plaintext` slice.
+        let in_end = unsafe { in_ptr.byte_add(sample.len()) };
+        let in_end_sub8 = in_end as usize - 8;
+
+        let mut prev_code: u16 = FSST_CODE_MASK;
+
+        while (in_ptr as usize) < (in_end_sub8) {
+            // SAFETY: ensured in-bounds by loop condition.
+            let word: u64 = unsafe { std::ptr::read_unaligned(in_ptr as *const u64) };
+            let code = self.find_longest_symbol(word);
+            let code_u16 = code.extended_code();
+
+            // Gain increases by the symbol length if a symbol matches, or 0
+            // if an escape is emitted.
+            gain += (code.len() as usize) - ((code_u16 < 256) as usize);
+
+            // Record the single and pair counts
+            counter.record_count1(code_u16);
+            counter.record_count2(prev_code, code_u16);
+
+            // Also record the count for just extending by a single byte, but only if
+            // the symbol is not itself a single byte.
+            if code.len() > 1 {
+                let code_first_byte = self.symbols[code_u16 as usize].first_byte() as u16;
+                counter.record_count1(code_first_byte);
+                counter.record_count2(prev_code, code_first_byte);
+            }
+
+            // SAFETY: pointer bound is checked in loop condition before any access is made.
+            in_ptr = unsafe { in_ptr.byte_add(code.len() as usize) };
+
+            prev_code = code_u16;
+        }
+
+        let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) };
+        assert!(
+            remaining_bytes.is_positive(),
+            "in_ptr exceeded in_end, should not be possible"
+        );
+        let remaining_bytes = remaining_bytes as usize;
+
+        // Load the last `remaining_byte`s of data into a final word.
We then replicate the loop above, + // but shift data out of this word rather than advancing an input pointer and potentially reading + // unowned memory + let mut bytes = [0u8; 8]; + unsafe { + // SAFETY: it is safe to read up to remaining_bytes from in_ptr, and remaining_bytes + // will be <= 8 bytes. + std::ptr::copy_nonoverlapping(in_ptr, bytes.as_mut_ptr(), remaining_bytes); + } + let mut last_word = u64::from_le_bytes(bytes); + + let mut remaining_bytes = remaining_bytes; + + while remaining_bytes > 0 { + // SAFETY: ensured in-bounds by loop condition. + let code = self.find_longest_symbol(last_word); + let code_u16 = code.extended_code(); + + // Gain increases by the symbol length if a symbol matches, or 0 + // if an escape is emitted. + gain += (code.len() as usize) - ((code_u16 < 256) as usize); + + // Record the single and pair counts + counter.record_count1(code_u16); + counter.record_count2(prev_code, code_u16); + + // Also record the count for just extending by a single byte, but only if + // the symbol is not itself a single byte. + if code.len() > 1 { + let code_first_byte = self.symbols[code_u16 as usize].first_byte() as u16; + counter.record_count1(code_first_byte); + counter.record_count2(prev_code, code_first_byte); + } + + // Advance our last_word "input pointer" by shifting off the covered values. + let advance = code.len() as usize; + remaining_bytes -= advance; + last_word = advance_8byte_word(last_word, advance); + + prev_code = code_u16; + } + + gain + } + + /// Using a set of counters and the existing set of symbols, build a new + /// set of symbols/codes that optimizes the gain over the distribution in `counter`. 
+ fn optimize( + &mut self, + counters: &Counter, + sample_frac: usize, + pqueue: &mut BinaryHeap, + ) { + for code1 in counters.first_codes() { + let symbol1 = self.symbols[code1 as usize]; + let symbol1_len = symbol1.len(); + let count = counters.count1(code1); + + // From the c++ impl: + // "improves both compression speed (less candidates), but also quality!!" + if count < (5 * sample_frac / 128) { + continue; + } + + let mut gain = count * symbol1_len; + // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols. + // This helps to reduce exception counts. + if code1 < 256 { + gain *= 8; + } + + pqueue.push(Candidate { + symbol: symbol1, + gain, + }); + + // Skip merges on last round, or when symbol cannot be extended. + if sample_frac >= 128 || symbol1_len == 8 { + continue; + } + + for code2 in counters.second_codes(code1) { + let symbol2 = self.symbols[code2 as usize]; + + // If merging would yield a symbol of length greater than 8, skip. + if symbol1_len + symbol2.len() > 8 { + continue; + } + let new_symbol = symbol1.concat(symbol2); + let gain = counters.count2(code1, code2) * new_symbol.len(); + + pqueue.push(Candidate { + symbol: new_symbol, + gain, + }) + } + } + + // clear self in advance of inserting the symbols. + self.clear(); + + // Pop the 255 best symbols. + let mut n_symbols = 0; + while !pqueue.is_empty() && n_symbols < 255 { + let candidate = pqueue.pop().unwrap(); + if self.insert(candidate.symbol, candidate.symbol.len()) { + n_symbols += 1; + } + } + } +} + +/// A candidate for inclusion in a symbol table. +/// +/// This is really only useful for the `optimize` step of training. 
+#[derive(Copy, Clone, Debug)] +struct Candidate { + gain: usize, + symbol: Symbol, +} + +impl Candidate { + fn comparable_form(&self) -> (usize, usize) { + (self.gain, self.symbol.len()) + } +} + +impl Eq for Candidate {} + +impl PartialEq for Candidate { + fn eq(&self, other: &Self) -> bool { + self.comparable_form().eq(&other.comparable_form()) + } +} + +impl PartialOrd for Candidate { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Candidate { + fn cmp(&self, other: &Self) -> Ordering { + let self_ord = (self.gain, self.symbol.len()); + let other_ord = (other.gain, other.symbol.len()); + + self_ord.cmp(&other_ord) + } +} + +#[cfg(test)] +mod test { + use crate::Compressor; + use crate::ESCAPE_CODE; + use crate::builder::CodesBitmap; + + #[test] + fn test_builder() { + // Train a Compressor on the toy string + let text = b"hello hello hello hello hello"; + + // count of 5 is the cutoff for including a symbol in the table. + let table = Compressor::train(&vec![text, text, text, text, text]); + + // Use the table to compress a string, see the values + let compressed = table.compress(text); + + // Ensure that the compressed string has no escape bytes + assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); + + // Ensure that we can compress a string with no values seen at training time, with escape bytes + let compressed = table.compress("xyz123".as_bytes()); + let decompressed = table.decompressor().decompress(&compressed); + assert_eq!(&decompressed, b"xyz123"); + assert_eq!( + compressed, + vec![ + ESCAPE_CODE, + b'x', + ESCAPE_CODE, + b'y', + ESCAPE_CODE, + b'z', + ESCAPE_CODE, + b'1', + ESCAPE_CODE, + b'2', + ESCAPE_CODE, + b'3', + ] + ); + } + + #[test] + fn test_bitmap() { + let mut map = CodesBitmap::default(); + map.set(10); + map.set(100); + map.set(500); + + let codes: Vec = map.codes().collect(); + assert_eq!(codes, vec![10u16, 100, 500]); + + // empty case + let map = CodesBitmap::default(); + 
assert!(map.codes().collect::>().is_empty()); + + // edge case: first bit in each block is set + let mut map = CodesBitmap::default(); + (0..8).for_each(|i| map.set(64 * i)); + assert_eq!( + map.codes().collect::>(), + (0u16..8).map(|i| 64 * i).collect::>(), + ); + + // Full bitmap case. There are only 512 values, so test them all + let mut map = CodesBitmap::default(); + for i in 0..512 { + map.set(i); + } + assert_eq!( + map.codes().collect::>(), + (0u16..511u16).collect::>() + ); + } + + #[test] + #[should_panic(expected = "code cannot exceed")] + fn test_bitmap_invalid() { + let mut map = CodesBitmap::default(); + map.set(512); + } + + #[test] + fn test_no_duplicate_symbols() { + // Train on data that is likely to produce duplicate 1-byte and 2-byte candidates. + let text = b"aababcabcdabcde"; + let corpus: Vec<&[u8]> = std::iter::repeat_n(text.as_slice(), 100).collect(); + let compressor = Compressor::train(&corpus); + + let symbols = compressor.symbol_table(); + let lengths = compressor.symbol_lengths(); + + // Collect all 1-byte symbols and check for duplicates. + let one_byte: Vec = symbols + .iter() + .zip(lengths.iter()) + .filter(|&(_, &len)| len == 1) + .map(|(sym, _)| sym.first_byte()) + .collect(); + let mut one_byte_sorted = one_byte.clone(); + one_byte_sorted.sort(); + one_byte_sorted.dedup(); + assert_eq!( + one_byte.len(), + one_byte_sorted.len(), + "duplicate 1-byte symbols found" + ); + + // Collect all 2-byte symbols and check for duplicates. 
+ let two_byte: Vec = symbols + .iter() + .zip(lengths.iter()) + .filter(|&(_, &len)| len == 2) + .map(|(sym, _)| sym.first2()) + .collect(); + let mut two_byte_sorted = two_byte.clone(); + two_byte_sorted.sort(); + two_byte_sorted.dedup(); + assert_eq!( + two_byte.len(), + two_byte_sorted.len(), + "duplicate 2-byte symbols found" + ); + } +} diff --git a/vendor/fsst-rs/src/lib.rs b/vendor/fsst-rs/src/lib.rs new file mode 100644 index 00000000000..33d843d48d8 --- /dev/null +++ b/vendor/fsst-rs/src/lib.rs @@ -0,0 +1,970 @@ +#![doc = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression"] +#![cfg(target_endian = "little")] + +/// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. +macro_rules! assert_sizeof { + ($typ:ty => $size_in_bytes:expr) => { + const _: [u8; $size_in_bytes] = [0; std::mem::size_of::<$typ>()]; + }; +} + +use std::fmt::Debug; +use std::fmt::Formatter; +use std::mem::MaybeUninit; + +use lossy_pht::LossyPHT; + +mod builder; +mod lossy_pht; + +pub use builder::*; + +/// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`Compressor`][`crate::Compressor`] and +/// identified by an 8-bit code. +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Symbol(u64); + +assert_sizeof!(Symbol => 8); + +impl Symbol { + /// Zero value for `Symbol`. + pub const ZERO: Self = Self::zero(); + + /// Constructor for a `Symbol` from an 8-element byte slice. + pub fn from_slice(slice: &[u8; 8]) -> Self { + let num: u64 = u64::from_le_bytes(*slice); + + Self(num) + } + + /// Return a zero symbol + const fn zero() -> Self { + Self(0) + } + + /// Create a new single-byte symbol + pub fn from_u8(value: u8) -> Self { + Self(value as u64) + } +} + +impl Symbol { + /// Calculate the length of the symbol in bytes. Always a value between 1 and 8. + /// + /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols + /// can contain fewer bytes, padded with 0x00. 
There is a special case of a symbol + /// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000` + /// but we want to interpret that as a one-byte symbol containing `0x00`. + #[allow(clippy::len_without_is_empty)] + pub fn len(self) -> usize { + let numeric = self.0; + // For little-endian platforms, this counts the number of *trailing* zeros + let null_bytes = (numeric.leading_zeros() >> 3) as usize; + + // Special case handling of a symbol with all-zeros. This is actually + // a 1-byte symbol containing 0x00. + let len = size_of::() - null_bytes; + if len == 0 { 1 } else { len } + } + + /// Returns the Symbol's inner representation. + #[inline] + pub fn to_u64(self) -> u64 { + self.0 + } + + /// Get the first byte of the symbol as a `u8`. + /// + /// If the symbol is empty, this will return the zero byte. + #[inline] + pub fn first_byte(self) -> u8 { + self.0 as u8 + } + + /// Get the first two bytes of the symbol as a `u16`. + /// + /// If the Symbol is one or zero bytes, this will return `0u16`. + #[inline] + pub fn first2(self) -> u16 { + self.0 as u16 + } + + /// Get the first three bytes of the symbol as a `u64`. + /// + /// If the Symbol is one or zero bytes, this will return `0u64`. + #[inline] + pub fn first3(self) -> u64 { + self.0 & 0xFF_FF_FF + } + + /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`. 
+ pub fn concat(self, other: Self) -> Self { + assert!( + self.len() + other.len() <= 8, + "cannot build symbol with length > 8" + ); + + let self_len = self.len(); + + Self((other.0 << (8 * self_len)) | self.0) + } +} + +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + + let slice = &self.0.to_le_bytes()[0..self.len()]; + for c in slice.iter().map(|c| *c as char) { + if ('!'..='~').contains(&c) { + write!(f, "{c}")?; + } else if c == '\n' { + write!(f, " \\n ")?; + } else if c == '\t' { + write!(f, " \\t ")?; + } else if c == ' ' { + write!(f, " SPACE ")?; + } else { + write!(f, " 0x{:X?} ", c as u8)? + } + } + + write!(f, "]") + } +} + +/// A packed type containing a code value, as well as metadata about the symbol referred to by +/// the code. +/// +/// Logically, codes can range from 0-255 inclusive. This type holds both the 8-bit code as well as +/// other metadata bit-packed into a `u16`. +/// +/// The bottom 8 bits contain EITHER a code for a symbol stored in the table, OR a raw byte. +/// +/// The interpretation depends on the 9th bit: when toggled off, the value stores a raw byte, and when +/// toggled on, it stores a code. Thus if you examine the bottom 9 bits of the `u16`, you have an extended +/// code range, where the values 0-255 are raw bytes, and the values 256-510 represent codes 0-254. 511 is +/// a placeholder for the invalid code here. +/// +/// Bits 12-15 store the length of the symbol (values ranging from 0-8). +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct Code(u16); + +/// Code used to indicate bytes that are not in the symbol table. +/// +/// When compressing a string that cannot fully be expressed with the symbol table, the compressed +/// output will contain an `ESCAPE` byte followed by a raw byte. 
At decompression time, the presence +/// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of +/// being looked up in the symbol table. +pub const ESCAPE_CODE: u8 = 255; + +/// Number of bits in the `ExtendedCode` that are used to dictate a code value. +pub const FSST_CODE_BITS: usize = 9; + +/// First bit of the "length" portion of an extended code. +pub const FSST_LEN_BITS: usize = 12; + +/// Maximum code value in the extended code range. +pub const FSST_CODE_MAX: u16 = 1 << FSST_CODE_BITS; + +/// Maximum value for the extended code range. +/// +/// When truncated to u8 this is code 255, which is equivalent to [`ESCAPE_CODE`]. +pub const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; + +/// First code in the symbol table that corresponds to a non-escape symbol. +pub const FSST_CODE_BASE: u16 = 256; + +#[allow(clippy::len_without_is_empty)] +impl Code { + /// Code for an unused slot in a symbol table or index. + /// + /// This corresponds to the maximum code with a length of 1. + pub const UNUSED: Self = Code(FSST_CODE_MASK + (1 << 12)); + + /// Create a new code for a symbol of given length. + fn new_symbol(code: u8, len: usize) -> Self { + Self(code as u16 + ((len as u16) << FSST_LEN_BITS)) + } + + /// Code for a new symbol during the building phase. + /// + /// The code is remapped from 0..254 to 256...510. + fn new_symbol_building(code: u8, len: usize) -> Self { + Self(code as u16 + 256 + ((len as u16) << FSST_LEN_BITS)) + } + + /// Create a new code corresponding for an escaped byte. 
+ fn new_escape(byte: u8) -> Self { + Self((byte as u16) + (1 << FSST_LEN_BITS)) + } + + #[inline] + fn code(self) -> u8 { + self.0 as u8 + } + + #[inline] + fn extended_code(self) -> u16 { + self.0 & 0b111_111_111 + } + + #[inline] + fn len(self) -> u16 { + self.0 >> FSST_LEN_BITS + } +} + +impl Debug for Code { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TrainingCode") + .field("code", &(self.0 as u8)) + .field("is_escape", &(self.0 < 256)) + .field("len", &(self.0 >> 12)) + .finish() + } +} + +/// Decompressor uses a symbol table to take a stream of 8-bit codes into a string. +#[derive(Clone)] +pub struct Decompressor<'a> { + /// Slice mapping codes to symbols. + pub(crate) symbols: &'a [Symbol], + + /// Slice containing the length of each symbol in the `symbols` slice. + pub(crate) lengths: &'a [u8], +} + +impl<'a> Decompressor<'a> { + /// Returns a new decompressor that uses the provided symbol table. + /// + /// # Panics + /// + /// If the provided symbol table has length greater than 256 + pub fn new(symbols: &'a [Symbol], lengths: &'a [u8]) -> Self { + assert!( + symbols.len() < FSST_CODE_BASE as usize, + "symbol table cannot have size exceeding 255" + ); + + Self { symbols, lengths } + } + + /// Returns an upper bound on the size of the decompressed data. + pub fn max_decompression_capacity(&self, compressed: &[u8]) -> usize { + size_of::() * (compressed.len() + 1) + } + + /// Decompress a slice of codes into a provided buffer. + /// + /// The provided `decoded` buffer must be at least the size of the decoded data, plus + /// an additional 7 bytes. + /// + /// ## Panics + /// + /// If the caller fails to provide sufficient capacity in the decoded buffer. An upper bound + /// on the required capacity can be obtained by calling [`Self::max_decompression_capacity`]. 
+    ///
+    /// ## Example
+    ///
+    /// ```
+    /// use fsst::{Symbol, Compressor, CompressorBuilder};
+    /// let compressor = {
+    ///     let mut builder = CompressorBuilder::new();
+    ///     builder.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', b'o', b'o', b'o']), 8);
+    ///     builder.build()
+    /// };
+    ///
+    /// let decompressor = compressor.decompressor();
+    ///
+    /// let mut decompressed = Vec::with_capacity(8 + 7);
+    ///
+    /// let len = decompressor.decompress_into(&[0], decompressed.spare_capacity_mut());
+    /// assert_eq!(len, 8);
+    /// unsafe { decompressed.set_len(len) };
+    /// assert_eq!(&decompressed, "helloooo".as_bytes());
+    /// ```
+    pub fn decompress_into(&self, compressed: &[u8], decoded: &mut [MaybeUninit<u8>]) -> usize {
+        // Ensure the target buffer is at least half the size of the input buffer.
+        // This is the theoretical smallest a valid target can be, and occurs when
+        // every input code is an escape.
+        assert!(
+            decoded.len() >= compressed.len() / 2,
+            "decoded is smaller than lower-bound decompressed size"
+        );
+
+        unsafe {
+            let mut in_ptr = compressed.as_ptr();
+            let _in_begin = in_ptr;
+            let in_end = in_ptr.add(compressed.len());
+
+            let mut out_ptr: *mut u8 = decoded.as_mut_ptr().cast();
+            let out_begin = out_ptr.cast_const();
+            let out_end = decoded.as_ptr().add(decoded.len()).cast::<u8>();
+
+            macro_rules! store_next_symbol {
+                ($code:expr) => {{
+                    out_ptr
+                        .cast::<u64>()
+                        .write_unaligned(self.symbols.get_unchecked($code as usize).to_u64());
+                    out_ptr = out_ptr.add(*self.lengths.get_unchecked($code as usize) as usize);
+                }};
+            }
+
+            // First we try loading 8 bytes at a time.
+            if decoded.len() >= 8 * size_of::<u64>() && compressed.len() >= 8 {
+                // Extract the loop condition since the compiler fails to do so
+                let block_out_end = out_end.sub(8 * size_of::<u64>());
+                let block_in_end = in_end.sub(8);
+
+                while out_ptr.cast_const() <= block_out_end && in_ptr < block_in_end {
+                    // Note that we load a little-endian u64 here.
+ let next_block = in_ptr.cast::().read_unaligned(); + let escape_mask = (next_block & 0x8080808080808080) + & ((((!next_block) & 0x7F7F7F7F7F7F7F7F) + 0x7F7F7F7F7F7F7F7F) + ^ 0x8080808080808080); + + // If there are no escape codes, we write each symbol one by one. + if escape_mask == 0 { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 40) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 48) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 56) & 0xFF) as u8; + store_next_symbol!(code); + in_ptr = in_ptr.add(8); + } else { + // Otherwise, find the first escape code and write the symbols up to that point. 
+ let first_escape_pos = escape_mask.trailing_zeros() >> 3; // Divide bits to bytes + debug_assert!(first_escape_pos < 8); + match first_escape_pos { + 7 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 40) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 48) & 0xFF) as u8; + store_next_symbol!(code); + + in_ptr = in_ptr.add(7); + } + 6 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 40) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 56) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(8); + } + 5 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 48) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(7); + } + 4 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = 
((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 40) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(6); + } + 3 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 32) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(5); + } + 2 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 24) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(4); + } + 1 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 16) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(3); + } + 0 => { + // Otherwise, we actually need to decompress the next byte + // Extract the second byte from the u32 + let escaped = ((next_block >> 8) & 0xFF) as u8; + in_ptr = in_ptr.add(2); + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + } + _ => unreachable!(), + } + } + } + } + + // Otherwise, fall back to 1-byte reads. 
+ while out_end.offset_from(out_ptr) > size_of::() as isize && in_ptr < in_end { + let code = in_ptr.read(); + in_ptr = in_ptr.add(1); + + if code == ESCAPE_CODE { + out_ptr.write(in_ptr.read()); + in_ptr = in_ptr.add(1); + out_ptr = out_ptr.add(1); + } else { + store_next_symbol!(code); + } + } + + assert_eq!( + in_ptr, in_end, + "decompression should exhaust input before output" + ); + + out_ptr.offset_from(out_begin) as usize + } + } + + /// Decompress a byte slice that was previously returned by a compressor using the same symbol + /// table into a new vector of bytes. + pub fn decompress(&self, compressed: &[u8]) -> Vec { + let mut decoded = Vec::with_capacity(self.max_decompression_capacity(compressed) + 7); + + let len = self.decompress_into(compressed, decoded.spare_capacity_mut()); + // SAFETY: len bytes have now been initialized by the decompressor. + unsafe { decoded.set_len(len) }; + decoded + } +} + +/// A compressor that uses a symbol table to greedily compress strings. +/// +/// The `Compressor` is the central component of FSST. You can create a compressor either by +/// default (i.e. an empty compressor), or by [training][`Self::train`] it on an input corpus of text. +/// +/// Example usage: +/// +/// ``` +/// use fsst::{Symbol, Compressor, CompressorBuilder}; +/// let compressor = { +/// let mut builder = CompressorBuilder::new(); +/// builder.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]), 5); +/// builder.build() +/// }; +/// +/// let compressed = compressor.compress("hello".as_bytes()); +/// assert_eq!(compressed, vec![0u8]); +/// ``` +#[derive(Clone)] +pub struct Compressor { + /// Table mapping codes to symbols. + pub(crate) symbols: Vec, + + /// Length of each symbol, values range from 1-8. + pub(crate) lengths: Vec, + + /// The number of entries in the symbol table that have been populated, not counting + /// the escape values. 
+ pub(crate) n_symbols: u8, + + /// Inverted index mapping 2-byte symbols to codes + codes_two_byte: Vec, + + /// Limit of no suffixes. + has_suffix_code: u8, + + /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more + lossy_pht: LossyPHT, +} + +/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s. +/// +/// The symbol table is trained on a corpus of data in the form of a single byte array, building up +/// a mapping of 1-byte "codes" to sequences of up to 8 plaintext bytes, or "symbols". +impl Compressor { + /// Using the symbol table, runs a single cycle of compression on an input word, writing + /// the output into `out_ptr`. + /// + /// # Returns + /// + /// This function returns a tuple of (advance_in, advance_out) with the number of bytes + /// for the caller to advance the input and output pointers. + /// + /// `advance_in` is the number of bytes to advance the input pointer before the next call. + /// + /// `advance_out` is the number of bytes to advance `out_ptr` before the next call. + /// + /// # Safety + /// + /// `out_ptr` must never be NULL or otherwise point to invalid memory. + pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { + // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and + // if it isn't, it will be overwritten anyway. + // + // SAFETY: caller ensures out_ptr is not null + let first_byte = word as u8; + // SAFETY: out_ptr is not null + unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; + + // First, check the two_bytes table + let code_twobyte = self.codes_two_byte[word as u16 as usize]; + + if code_twobyte.code() < self.has_suffix_code { + // 2 byte code without having to worry about longer matches. + // SAFETY: out_ptr is not null. 
+ unsafe { std::ptr::write(out_ptr, code_twobyte.code()) }; + + // Advance input by symbol length (2) and output by a single code byte + (2, 1) + } else { + // Probe the hash table + let entry = self.lossy_pht.lookup(word); + + // Now, downshift the `word` and the `entry` to see if they align. + let ignored_bits = entry.ignored_bits; + if entry.code != Code::UNUSED + && compare_masked(word, entry.symbol.to_u64(), ignored_bits) + { + // Advance the input by the symbol length (variable) and the output by one code byte + // SAFETY: out_ptr is not null. + unsafe { std::ptr::write(out_ptr, entry.code.code()) }; + (entry.code.len() as usize, 1) + } else { + // SAFETY: out_ptr is not null + unsafe { std::ptr::write(out_ptr, code_twobyte.code()) }; + + // Advance the input by the symbol length (variable) and the output by either 1 + // byte (if was one-byte code) or two bytes (escape). + ( + code_twobyte.len() as usize, + // Predicated version of: + // + // if entry.code >= 256 { + // 2 + // } else { + // 1 + // } + 1 + (code_twobyte.extended_code() >> 8) as usize, + ) + } + } + } + + /// Compress many lines in bulk. + pub fn compress_bulk(&self, lines: &Vec<&[u8]>) -> Vec> { + let mut res = Vec::new(); + + for line in lines { + res.push(self.compress(line)); + } + + res + } + + /// Compress a string, writing its result into a target buffer. + /// + /// The target buffer is a byte vector that must have capacity large enough + /// to hold the encoded data. + /// + /// When this call returns, `values` will hold the compressed bytes and have + /// its length set to the length of the compressed text. + /// + /// ``` + /// use fsst::{Compressor, CompressorBuilder, Symbol}; + /// + /// let mut compressor = CompressorBuilder::new(); + /// assert!(compressor.insert(Symbol::from_slice(b"aaaaaaaa"), 8)); + /// + /// let compressor = compressor.build(); + /// + /// let mut compressed_values = Vec::with_capacity(1_024); + /// + /// // SAFETY: we have over-sized compressed_values. 
+ /// unsafe { + /// compressor.compress_into(b"aaaaaaaa", &mut compressed_values); + /// } + /// + /// assert_eq!(compressed_values, vec![0u8]); + /// ``` + /// + /// # Safety + /// + /// It is up to the caller to ensure the provided buffer is large enough to hold + /// all encoded data. + pub unsafe fn compress_into(&self, plaintext: &[u8], values: &mut Vec<u8>) { + let mut in_ptr = plaintext.as_ptr(); + let mut out_ptr = values.as_mut_ptr(); + + // SAFETY: `end` will point just after the end of the `plaintext` slice. + let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; + let in_end_sub8 = in_end as usize - 8; + // SAFETY: `end` will point just after the end of the `values` allocation. + let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; + + while (in_ptr as usize) <= in_end_sub8 && out_ptr < out_end { + // SAFETY: pointer ranges are checked in the loop condition + unsafe { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. + let word: u64 = std::ptr::read_unaligned(in_ptr as *const u64); + let (advance_in, advance_out) = self.compress_word(word, out_ptr); + in_ptr = in_ptr.byte_add(advance_in); + out_ptr = out_ptr.byte_add(advance_out); + }; + } + + let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; + assert!( + out_ptr < out_end || remaining_bytes == 0, + "output buffer sized too small" + ); + + let remaining_bytes = remaining_bytes as usize; + + // Load the last `remaining_byte`s of data into a final word. We then replicate the loop above, + // but shift data out of this word rather than advancing an input pointer and potentially reading + // unowned memory. 
+ let mut bytes = [0u8; 8]; + // SAFETY: remaining_bytes <= 8 + unsafe { std::ptr::copy_nonoverlapping(in_ptr, bytes.as_mut_ptr(), remaining_bytes) }; + let mut last_word = u64::from_le_bytes(bytes); + + while in_ptr < in_end && out_ptr < out_end { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null + let (advance_in, advance_out) = unsafe { self.compress_word(last_word, out_ptr) }; + // SAFETY: pointer ranges are checked in the loop condition + unsafe { + in_ptr = in_ptr.add(advance_in); + out_ptr = out_ptr.add(advance_out); + } + + last_word = advance_8byte_word(last_word, advance_in); + } + + // in_ptr should have exceeded in_end + assert!( + in_ptr >= in_end, + "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()" + ); + + assert!(out_ptr <= out_end, "output buffer sized too small"); + + // SAFETY: out_ptr is derived from the `values` allocation. + let bytes_written = unsafe { out_ptr.offset_from(values.as_ptr()) }; + assert!( + bytes_written >= 0, + "out_ptr ended before it started, not possible" + ); + + // SAFETY: we have initialized `bytes_written` values in the output buffer. + unsafe { values.set_len(bytes_written as usize) }; + } + + /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. + pub fn compress(&self, plaintext: &[u8]) -> Vec { + if plaintext.is_empty() { + return Vec::new(); + } + + let mut buffer = Vec::with_capacity(plaintext.len() * 2); + + // SAFETY: the largest compressed size would be all escapes == 2*plaintext_len + unsafe { self.compress_into(plaintext, &mut buffer) }; + + buffer + } + + /// Access the decompressor that can be used to decompress strings emitted from this + /// `Compressor` instance. + pub fn decompressor(&self) -> Decompressor<'_> { + Decompressor::new(self.symbol_table(), self.symbol_lengths()) + } + + /// Returns a readonly slice of the current symbol table. 
+ /// + /// The returned slice will have length of `n_symbols`. + pub fn symbol_table(&self) -> &[Symbol] { + &self.symbols[0..self.n_symbols as usize] + } + + /// Returns a readonly slice where index `i` contains the + /// length of the symbol represented by code `i`. + /// + /// Values range from 1-8. + pub fn symbol_lengths(&self) -> &[u8] { + &self.lengths[0..self.n_symbols as usize] + } + + /// Rebuild a compressor from an existing symbol table. + /// + /// This will not attempt to optimize or re-order the codes. + pub fn rebuild_from(symbols: impl AsRef<[Symbol]>, symbol_lens: impl AsRef<[u8]>) -> Self { + let symbols = symbols.as_ref(); + let symbol_lens = symbol_lens.as_ref(); + + assert_eq!( + symbols.len(), + symbol_lens.len(), + "symbols and lengths differ" + ); + assert!( + symbols.len() <= 255, + "symbol table len must be <= 255, was {}", + symbols.len() + ); + validate_symbol_order(symbol_lens); + + // Insert the symbols in their given order into the FSST lookup structures. + let symbols = symbols.to_vec(); + let lengths = symbol_lens.to_vec(); + let mut lossy_pht = LossyPHT::new(); + + let mut codes_one_byte = vec![Code::UNUSED; 256]; + + // Insert all of the one byte symbols first. + for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + if len == 1 { + codes_one_byte[symbol.first_byte() as usize] = Code::new_symbol(code as u8, 1); + } + } + + // Initialize the codes_two_byte table to be all escapes + let mut codes_two_byte = vec![Code::UNUSED; 65_536]; + + // Insert the two byte symbols, possibly overwriting slots for one-byte symbols and escapes. + for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + match len { + 2 => { + codes_two_byte[symbol.first2() as usize] = Code::new_symbol(code as u8, 2); + } + 3.. => { + assert!( + lossy_pht.insert(symbol, len as usize, code as u8), + "rebuild symbol insertion into PHT must succeed" + ); + } + _ => { /* Covered by the 1-byte loop above. 
*/ } + } + } + + // Build the finished codes_two_byte table, subbing in unused positions with the + // codes_one_byte value similar to what we do in CompressBuilder::finalize. + for (symbol, code) in codes_two_byte.iter_mut().enumerate() { + if *code == Code::UNUSED { + *code = codes_one_byte[symbol & 0xFF]; + } + } + + // Find the position of the first 2-byte code that has a suffix later in the table + let mut has_suffix_code = 0u8; + for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + if len != 2 { + break; + } + let rest = &symbols[code..]; + if rest + .iter() + .any(|&other| other.len() > 2 && symbol.first2() == other.first2()) + { + has_suffix_code = code as u8; + break; + } + } + + Compressor { + n_symbols: symbols.len() as u8, + symbols, + lengths, + codes_two_byte, + lossy_pht, + has_suffix_code, + } + } +} + +#[inline] +pub(crate) fn advance_8byte_word(word: u64, bytes: usize) -> u64 { + // shift the word off the low-end, because little endian means the first + // char is stored in the LSB. + // + // Note that even though this looks like it branches, Rust compiles this to a + // conditional move instruction. See `` + if bytes == 8 { 0 } else { word >> (8 * bytes) } +} + +fn validate_symbol_order(symbol_lens: &[u8]) { + // Ensure that the symbol table is ordered by length, 23456781 + let mut expected = 2; + for (idx, &len) in symbol_lens.iter().enumerate() { + if expected == 1 { + assert_eq!( + len, 1, + "symbol code={idx} should be one byte, was {len} bytes" + ); + } else { + if len == 1 { + expected = 1; + } + + // we're in the non-zero portion. 
+ assert!( + len >= expected, + "symbol code={idx} breaks violates FSST symbol table ordering" + ); + expected = len; + } + } +} + +#[inline] +pub(crate) fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { + let mask = u64::MAX >> ignored_bits; + (left & mask) == right +} + +#[cfg(test)] +mod test { + use std::iter; + use std::mem; + + use super::*; + #[test] + fn test_stuff() { + let compressor = { + let mut builder = CompressorBuilder::new(); + builder.insert(Symbol::from_slice(b"helloooo"), 8); + builder.build() + }; + + let decompressor = compressor.decompressor(); + + let mut decompressed = Vec::with_capacity(8 + 7); + + let len = decompressor.decompress_into(&[0], decompressed.spare_capacity_mut()); + assert_eq!(len, 8); + unsafe { decompressed.set_len(len) }; + assert_eq!(&decompressed, "helloooo".as_bytes()); + } + + #[test] + fn test_symbols_good() { + let symbols_u64: &[u64] = &[ + 24931, 25698, 25442, 25699, 25186, 25444, 24932, 25188, 25185, 25441, 25697, 25700, + 24929, 24930, 25443, 25187, 6513249, 6512995, 6578786, 6513761, 6513507, 6382434, + 6579042, 6512994, 6447460, 6447969, 6382178, 6579041, 6512993, 6448226, 6513250, + 6579297, 6513506, 6447459, 6513764, 6447458, 6578529, 6382180, 6513762, 6447714, + 6579299, 6513508, 6382436, 6513763, 6578532, 6381924, 6448228, 6579300, 6381921, + 6382690, 6382179, 6447713, 6447972, 6513505, 6447457, 6382692, 6513252, 6578785, + 6578787, 6578531, 6448225, 6382177, 6382433, 6578530, 6448227, 6381922, 6578788, + 6579044, 6382691, 6512996, 6579043, 6579298, 6447970, 6447716, 6447971, 6381923, + 6447715, 97, 98, 100, 99, 97, 98, 99, 100, + ]; + let symbols: &[Symbol] = unsafe { mem::transmute(symbols_u64) }; + let lens: Vec = iter::repeat_n(2u8, 16) + .chain(iter::repeat_n(3u8, 61)) + .chain(iter::repeat_n(1u8, 8)) + .collect(); + + let compressor = Compressor::rebuild_from(symbols, lens); + let built_symbols: &[u64] = unsafe { mem::transmute(compressor.symbol_table()) }; + 
assert_eq!(built_symbols, symbols_u64); + } + + #[should_panic(expected = "assertion `left == right` failed")] + #[test] + fn test_symbols_bad() { + let symbols: &[u64] = &[ + 24931, 25698, 25442, 25699, 25186, 25444, 24932, 25188, 25185, 25441, 25697, 25700, + 24929, 24930, 25443, 25187, 6513249, 6512995, 6578786, 6513761, 6513507, 6382434, + 6579042, 6512994, 6447460, 6447969, 6382178, 6579041, 6512993, 6448226, 6513250, + 6579297, 6513506, 6447459, 6513764, 6447458, 6578529, 6382180, 6513762, 6447714, + 6579299, 6513508, 6382436, 6513763, 6578532, 6381924, 6448228, 6579300, 6381921, + 6382690, 6382179, 6447713, 6447972, 6513505, 6447457, 6382692, 6513252, 6578785, + 6578787, 6578531, 6448225, 6382177, 6382433, 6578530, 6448227, 6381922, 6578788, + 6579044, 6382691, 6512996, 6579043, 6579298, 6447970, 6447716, 6447971, 6381923, + 6447715, 97, 98, 100, 99, 97, 98, 99, 100, + ]; + let lens: Vec = iter::repeat_n(2u8, 16) + .chain(iter::repeat_n(3u8, 61)) + .chain(iter::repeat_n(1u8, 8)) + .collect(); + + let mut builder = CompressorBuilder::new(); + for (symbol, len) in symbols.iter().zip(lens.iter()) { + let symbol = Symbol::from_slice(&symbol.to_le_bytes()); + builder.insert(symbol, *len as usize); + } + let compressor = builder.build(); + let built_symbols: &[u64] = unsafe { mem::transmute(compressor.symbol_table()) }; + assert_eq!(built_symbols, symbols); + } +} diff --git a/vendor/fsst-rs/src/lossy_pht.rs b/vendor/fsst-rs/src/lossy_pht.rs new file mode 100644 index 00000000000..83a32030be4 --- /dev/null +++ b/vendor/fsst-rs/src/lossy_pht.rs @@ -0,0 +1,128 @@ +use std::fmt::Debug; + +use crate::Code; +use crate::Symbol; +use crate::builder::fsst_hash; + +/// Size of the perfect hash table. +/// +/// NOTE: this differs from the paper, which recommends a 64KB total +/// table size. The paper does not account for the fact that most +/// vendors split the L1 cache into 32KB of instruction and 32KB of data. 
+pub const HASH_TABLE_SIZE: usize = 1 << 11; + +/// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`]. +/// +/// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See +/// the module documentation for a link to the paper. +#[derive(Clone, Debug)] +#[repr(C)] +pub(crate) struct TableEntry { + /// Symbol, piece of a string, 8 bytes or fewer. + pub(crate) symbol: Symbol, + + /// Code and associated metadata for the symbol + pub(crate) code: Code, + + /// Number of ignored bits in `symbol`. + /// + /// This is equivalent to `64 - 8 * code.len()` but is pre-computed to save a few instructions in + /// the compression loop. + pub(crate) ignored_bits: u16, +} + +assert_sizeof!(TableEntry => 16); + +impl TableEntry { + pub(crate) fn is_unused(&self) -> bool { + self.code == Code::UNUSED + } +} + +/// Lossy Perfect Hash Table implementation for compression. +/// +/// This implements the "Lossy Perfect Hash Table" described in Section 5 of the paper. +/// +/// It is so-called because the `insert` operation for a symbol may fail, if another symbol is +/// already occupying the slot. +/// +/// If insertions are made from highest-gain to lowest and from longest-symbol to shortest, then +/// we can say that any failed insert is not a big loss, because its slot is being held by a higher-gain +/// symbol. Note that because other code in this crate calls `insert` in the pop-order of a max heap, +/// this holds. +#[derive(Clone, Debug)] +pub(crate) struct LossyPHT { + /// Hash table slots. Used for strings that are 3 bytes or more. + slots: Vec, +} + +impl LossyPHT { + /// Construct a new empty lossy perfect hash table + pub(crate) fn new() -> Self { + let slots = vec![ + TableEntry { + symbol: Symbol::ZERO, + code: Code::UNUSED, + ignored_bits: 64, + }; + HASH_TABLE_SIZE + ]; + + Self { slots } + } + + /// Try and insert the (symbol, code) pair into the table. 
+ /// + /// If there is a collision, we keep the current thing and reject the write. + /// + /// # Returns + /// + /// True if the symbol was inserted into the table, false if it was rejected due to collision. + pub(crate) fn insert(&mut self, symbol: Symbol, len: usize, code: u8) -> bool { + let prefix_3bytes = symbol.to_u64() & 0xFF_FF_FF; + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let entry = &mut self.slots[slot]; + if !entry.is_unused() { + false + } else { + entry.symbol = symbol; + entry.code = Code::new_symbol_building(code, len); + entry.ignored_bits = (64 - 8 * symbol.len()) as u16; + true + } + } + + /// Given a new code mapping, rewrite the codes into the new code range. + pub(crate) fn renumber(&mut self, new_codes: &[u8]) { + for slot in self.slots.iter_mut() { + if slot.code != Code::UNUSED { + let old_code = slot.code.code(); + let new_code = new_codes[old_code as usize]; + let len = slot.code.len(); + slot.code = Code::new_symbol(new_code, len as usize); + } + } + } + + /// Remove the symbol from the hashtable, if it exists. + pub(crate) fn remove(&mut self, symbol: Symbol) { + let prefix_3bytes = symbol.to_u64() & 0xFF_FF_FF; + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + self.slots[slot].code = Code::UNUSED; + } + + #[inline] + pub(crate) fn lookup(&self, word: u64) -> &TableEntry { + let prefix_3bytes = word & 0xFF_FF_FF; + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + + // SAFETY: the slot is guaranteed to between [0, HASH_TABLE_SIZE). 
+ unsafe { self.slots.get_unchecked(slot) } + } +} + +impl Default for LossyPHT { + fn default() -> Self { + Self::new() + } +} From 93e5df7a6e7f26f14b2da0fc5e2f53cb382e5462 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 12 Mar 2026 14:34:02 +0000 Subject: [PATCH 2/2] wip: fsst dedup Signed-off-by: Joe Isaacs --- Cargo.toml | 2 +- vendor/fsst-rs/Cargo.toml | 1 + vendor/fsst-rs/src/builder.rs | 3 +++ vendor/fsst-rs/src/lib.rs | 3 +++ vendor/fsst-rs/src/lossy_pht.rs | 3 +++ 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d8d111f0822..71a58ae75e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -144,7 +144,7 @@ enum-iterator = "2.0.0" env_logger = "0.11" fastlanes = "0.5" flatbuffers = "25.2.10" -fsst-rs = { path = "vendor/fsst-rs" } +fsst-rs = { version = "0.5.6", path = "vendor/fsst-rs" } futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" get_dir = "0.5.0" diff --git a/vendor/fsst-rs/Cargo.toml b/vendor/fsst-rs/Cargo.toml index 666a2166c7a..0f8488766a6 100644 --- a/vendor/fsst-rs/Cargo.toml +++ b/vendor/fsst-rs/Cargo.toml @@ -10,6 +10,7 @@ repository = "https://github.com/spiraldb/fsst" readme = "README.md" keywords = ["compression", "fsst"] categories = ["compression"] +publish = false [lib] name = "fsst" diff --git a/vendor/fsst-rs/src/builder.rs b/vendor/fsst-rs/src/builder.rs index f2b911c8b5b..4ef9b9b220c 100644 --- a/vendor/fsst-rs/src/builder.rs +++ b/vendor/fsst-rs/src/builder.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright SpiralDB Developers + //! Functions and types used for building a [`Compressor`] from a corpus of text. //! //! This module implements the logic from Algorithm 3 of the [FSST Paper]. 
diff --git a/vendor/fsst-rs/src/lib.rs b/vendor/fsst-rs/src/lib.rs index 33d843d48d8..74aa795105c 100644 --- a/vendor/fsst-rs/src/lib.rs +++ b/vendor/fsst-rs/src/lib.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright SpiralDB Developers + #![doc = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression"] #![cfg(target_endian = "little")] diff --git a/vendor/fsst-rs/src/lossy_pht.rs b/vendor/fsst-rs/src/lossy_pht.rs index 83a32030be4..efc765ad24e 100644 --- a/vendor/fsst-rs/src/lossy_pht.rs +++ b/vendor/fsst-rs/src/lossy_pht.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright SpiralDB Developers + use std::fmt::Debug; use crate::Code;