From dc7c5af932bdef45296cc4f1f6f7aed74c98dfda Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 12 Mar 2026 13:57:27 +0000 Subject: [PATCH 1/2] wip: fsst dedup Signed-off-by: Joe Isaacs --- Cargo.lock | 2 - Cargo.toml | 4 +- encodings/fsst/examples/fsst_symbol_table.rs | 59 ++ encodings/fsst/src/array.rs | 76 ++ vendor/fsst-rs/Cargo.toml | 55 + vendor/fsst-rs/LICENSE | 201 ++++ vendor/fsst-rs/src/builder.rs | 993 +++++++++++++++++++ vendor/fsst-rs/src/lib.rs | 970 ++++++++++++++++++ vendor/fsst-rs/src/lossy_pht.rs | 128 +++ 9 files changed, 2485 insertions(+), 3 deletions(-) create mode 100644 encodings/fsst/examples/fsst_symbol_table.rs create mode 100644 vendor/fsst-rs/Cargo.toml create mode 100644 vendor/fsst-rs/LICENSE create mode 100644 vendor/fsst-rs/src/builder.rs create mode 100644 vendor/fsst-rs/src/lib.rs create mode 100644 vendor/fsst-rs/src/lossy_pht.rs diff --git a/Cargo.lock b/Cargo.lock index d380a3d6229..6954cf3cdb1 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3701,8 +3701,6 @@ dependencies = [ [[package]] name = "fsst-rs" version = "0.5.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "561f2458a3407836ab8f1acc9113b8cda91b9d6378ba8dad13b2fe1a1d3af5ce" [[package]] name = "fst" diff --git a/Cargo.toml b/Cargo.toml index 0da5ee805ba..d8d111f0822 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -48,6 +48,8 @@ members = [ "encodings/zigzag", "encodings/zstd", "encodings/bytebool", + # Vendored dependencies + "vendor/fsst-rs", # Benchmarks "benchmarks/lance-bench", "benchmarks/compress-bench", @@ -142,7 +144,7 @@ enum-iterator = "2.0.0" env_logger = "0.11" fastlanes = "0.5" flatbuffers = "25.2.10" -fsst-rs = "0.5.5" +fsst-rs = { path = "vendor/fsst-rs" } futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" get_dir = "0.5.0" diff --git a/encodings/fsst/examples/fsst_symbol_table.rs b/encodings/fsst/examples/fsst_symbol_table.rs new file mode 100644 index 00000000000..c1cad6e3b10 --- /dev/null +++ 
b/encodings/fsst/examples/fsst_symbol_table.rs @@ -0,0 +1,59 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Reads lines from stdin, trains an FSST symbol table, and prints it. +//! +//! Usage: +//! cat urls.txt | cargo run -p vortex-fsst --example fsst_symbol_table +//! duckdb -csv -noheader -c "SELECT URL FROM 'hits_0.parquet' LIMIT 100000" | cargo run ... + +#![allow(clippy::expect_used)] + +use std::io; +use std::io::BufRead; + +use vortex_array::arrays::VarBinArray; +use vortex_array::dtype::DType; +use vortex_array::dtype::Nullability; +use vortex_fsst::fsst_compress; +use vortex_fsst::fsst_train_compressor; + +fn main() { + let stdin = io::stdin(); + let lines: Vec<Option<Box<[u8]>>> = stdin + .lock() + .lines() + .map(|l| { + l.expect("failed to read line") + .into_bytes() + .into_boxed_slice() + }) + .map(Some) + .collect(); + + let n = lines.len(); + eprintln!("Read {n} lines from stdin"); + + let varbin = VarBinArray::from_iter(lines, DType::Utf8(Nullability::NonNullable)); + let compressor = fsst_train_compressor(&varbin); + let fsst_array = fsst_compress(&varbin, &compressor); + + print!("{}", fsst_array.format_symbol_table()); + + // Report duplicate symbols in the table. 
+ let symbols = compressor.symbol_table(); + let lengths = compressor.symbol_lengths(); + let total = symbols.len(); + let mut keys: Vec<(u64, u8)> = symbols + .iter() + .zip(lengths.iter()) + .map(|(sym, &len)| (sym.to_u64(), len)) + .collect(); + keys.sort(); + let unique_count = { + keys.dedup(); + keys.len() + }; + let duplicates = total - unique_count; + eprintln!("Symbol table: {total} symbols, {duplicates} duplicates"); +} diff --git a/encodings/fsst/src/array.rs b/encodings/fsst/src/array.rs index 2f10b0b1a47..4ee89b73ee8 100644 --- a/encodings/fsst/src/array.rs +++ b/encodings/fsst/src/array.rs @@ -3,6 +3,7 @@ use std::fmt::Debug; use std::fmt::Formatter; +use std::fmt::Write; use std::hash::Hash; use std::sync::Arc; use std::sync::LazyLock; @@ -514,6 +515,46 @@ impl FSSTArray { pub fn compressor(&self) -> &Compressor { self.compressor.as_ref() } + + /// Format the FSST symbol table as a human-readable table. + /// + /// Each row shows the symbol code, byte length, hex bytes, and a text representation + /// where printable ASCII is shown as-is and other bytes are escaped. 
+ pub fn format_symbol_table(&self) -> String { + fn escape_byte(b: u8) -> String { + match b { + b'\n' => "\\n".to_string(), + b'\t' => "\\t".to_string(), + b'\r' => "\\r".to_string(), + b if b.is_ascii_graphic() || b == b' ' => (b as char).to_string(), + b => format!("\\x{b:02x}"), + } + } + + let symbols = self.symbols.as_slice(); + let lengths = self.symbol_lengths.as_slice(); + let n = symbols.len(); + + let mut out = format!("FSST Symbol Table ({n} symbols)\n"); + let _ = writeln!(out, "{:>4} | {:>3} | {:<23} | Text", "Code", "Len", "Hex"); + let _ = writeln!(out, "-----+-----+-------------------------+------"); + + for (code, (sym, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + let bytes = sym.to_u64().to_le_bytes(); + let actual = &bytes[..len as usize]; + + let hex: String = actual + .iter() + .map(|b| format!("{b:02x}")) + .collect::>() + .join(" "); + + let text: String = actual.iter().map(|&b| escape_byte(b)).collect(); + + let _ = writeln!(out, "{code:>4} | {len:>3} | {hex:<23} | {text}"); + } + out + } } impl ValidityChild for FSSTVTable { @@ -544,7 +585,9 @@ mod test { use crate::FSSTVTable; use crate::array::FSSTMetadata; + use crate::fsst_compress; use crate::fsst_compress_iter; + use crate::fsst_train_compressor; #[cfg_attr(miri, ignore)] #[test] @@ -628,4 +671,37 @@ mod test { }) .unwrap() } + + #[test] + fn test_format_symbol_table_urls() { + use vortex_array::arrays::VarBinArray; + + // Clickbench-style URL data + let urls: Vec> = vec![ + Some(b"http://smeshariki.ru/GameMain.aspx?id=123&tab=1#ref=123"), + Some(b"http://smeshariki.ru/index.php?id=456&tab=2#ref=456"), + Some(b"http://auto.ru/cars/used/sale?id=789&tab=3#ref=789"), + Some(b"http://komme.ru/search?id=1000&tab=4#ref=1000"), + Some(b"http://yandex.ru/news/article?id=2000&tab=5#ref=2000"), + Some(b"http://mail.ru/user/profile?id=3000&tab=6#ref=3000"), + Some(b"http://smeshariki.ru/catalog/item?id=4000&tab=7#ref=4000"), + 
Some(b"http://auto.ru/forum/thread?id=5000&tab=8#ref=5000"), + Some(b"http://vk.com/photo/album?id=6000&tab=9#ref=6000"), + Some(b"http://livejournal.com/blog/post?id=7000&tab=10#ref=7000"), + ]; + + let varbin = VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable)); + let compressor = fsst_train_compressor(&varbin); + let fsst_array = fsst_compress(&varbin, &compressor); + + let table = fsst_array.format_symbol_table(); + eprintln!("{table}"); + + // Verify basic structure + assert!(table.starts_with("FSST Symbol Table (")); + assert!(table.contains("Code")); + assert!(table.contains("Len")); + assert!(table.contains("Hex")); + assert!(table.contains("Text")); + } } diff --git a/vendor/fsst-rs/Cargo.toml b/vendor/fsst-rs/Cargo.toml new file mode 100644 index 00000000000..666a2166c7a --- /dev/null +++ b/vendor/fsst-rs/Cargo.toml @@ -0,0 +1,55 @@ +[package] +name = "fsst-rs" +version = "0.5.6" +edition = "2024" +rust-version = "1.86.0" +authors = ["SpiralDB Developers "] +description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression" +license = "Apache-2.0" +repository = "https://github.com/spiraldb/fsst" +readme = "README.md" +keywords = ["compression", "fsst"] +categories = ["compression"] + +[lib] +name = "fsst" +path = "src/lib.rs" + +[lints.clippy] +or_fun_call = "deny" + +[lints.clippy.all] +level = "deny" +priority = -1 + +[lints.clippy.if_then_some_else_none] +level = "deny" +priority = 0 + +[lints.clippy.mem_forget] +level = "deny" +priority = 0 + +[lints.clippy.panic_in_result_fn] +level = "deny" +priority = 0 + +[lints.clippy.same_name_method] +level = "deny" +priority = 0 + +[lints.clippy.tests_outside_test_module] +level = "deny" +priority = 0 + +[lints.clippy.unwrap_in_result] +level = "deny" +priority = 0 + +[lints.clippy.use_debug] +level = "deny" +priority = 0 + +[lints.rust] +missing_docs = "deny" +warnings = "deny" diff --git a/vendor/fsst-rs/LICENSE b/vendor/fsst-rs/LICENSE new file mode 
100644 index 00000000000..261eeb9e9f8 --- /dev/null +++ b/vendor/fsst-rs/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). 
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. 
Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative 
Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/vendor/fsst-rs/src/builder.rs b/vendor/fsst-rs/src/builder.rs new file mode 100644 index 00000000000..f2b911c8b5b --- /dev/null +++ b/vendor/fsst-rs/src/builder.rs @@ -0,0 +1,993 @@ +//! Functions and types used for building a [`Compressor`] from a corpus of text. +//! +//! This module implements the logic from Algorithm 3 of the [FSST Paper]. +//! +//! 
[FSST Paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + +use std::cmp::Ordering; +use std::collections::BinaryHeap; + +use crate::Code; +use crate::Compressor; +use crate::FSST_CODE_BASE; +use crate::FSST_CODE_MASK; +use crate::Symbol; +use crate::advance_8byte_word; +use crate::compare_masked; +use crate::lossy_pht::LossyPHT; + +/// Bitmap that only works for values up to 512 +#[derive(Clone, Copy, Debug, Default)] +struct CodesBitmap { + codes: [u64; 8], +} + +assert_sizeof!(CodesBitmap => 64); + +impl CodesBitmap { + /// Set the indicated bit. Must be between 0 and [`FSST_CODE_MASK`][crate::FSST_CODE_MASK]. + pub(crate) fn set(&mut self, index: usize) { + debug_assert!( + index <= FSST_CODE_MASK as usize, + "code cannot exceed {FSST_CODE_MASK}" + ); + + let map = index >> 6; + self.codes[map] |= 1 << (index % 64); + } + + /// Check if `index` is present in the bitmap + pub(crate) fn is_set(&self, index: usize) -> bool { + debug_assert!( + index <= FSST_CODE_MASK as usize, + "code cannot exceed {FSST_CODE_MASK}" + ); + + let map = index >> 6; + self.codes[map] & (1 << (index % 64)) != 0 + } + + /// Get all codes set in this bitmap + pub(crate) fn codes(&self) -> CodesIterator<'_> { + CodesIterator { + inner: self, + index: 0, + block: self.codes[0], + reference: 0, + } + } + + /// Clear the bitmap of all entries. 
+ pub(crate) fn clear(&mut self) { + self.codes[0] = 0; + self.codes[1] = 0; + self.codes[2] = 0; + self.codes[3] = 0; + self.codes[4] = 0; + self.codes[5] = 0; + self.codes[6] = 0; + self.codes[7] = 0; + } +} + +struct CodesIterator<'a> { + inner: &'a CodesBitmap, + index: usize, + block: u64, + reference: usize, +} + +impl Iterator for CodesIterator<'_> { + type Item = u16; + + fn next(&mut self) -> Option { + // If current is zero, advance to next non-zero block + while self.block == 0 { + self.index += 1; + if self.index >= 8 { + return None; + } + self.block = self.inner.codes[self.index]; + self.reference = self.index * 64; + } + + // Find the next set bit in the current block. + let position = self.block.trailing_zeros() as usize; + let code = self.reference + position; + + if code >= 511 { + return None; + } + + // The next iteration will calculate with reference to the returned code + 1 + self.reference = code + 1; + self.block = if position == 63 { + 0 + } else { + self.block >> (1 + position) + }; + + Some(code as u16) + } +} + +#[derive(Debug, Clone)] +struct Counter { + /// Frequency count for each code. + counts1: Vec, + + /// Frequency count for each code-pair. + counts2: Vec, + + /// Bitmap index for codes that appear in counts1 + code1_index: CodesBitmap, + + /// Bitmap index of pairs that have been set. + /// + /// `pair_index[code1].codes()` yields an iterator that can + /// be used to find all possible codes that follow `codes1`. + pair_index: Vec, +} + +const COUNTS1_SIZE: usize = (FSST_CODE_MASK + 1) as usize; + +// NOTE: in Rust, creating a 1D vector of length N^2 is ~4x faster than creating a 2-D vector, +// because `vec!` has a specialization for zero. +// +// We also include +1 extra row at the end so that we can do writes into the counters without a branch +// for the first iteration. 
+const COUNTS2_SIZE: usize = COUNTS1_SIZE * COUNTS1_SIZE; + +impl Counter { + fn new() -> Self { + let mut counts1 = Vec::with_capacity(COUNTS1_SIZE); + let mut counts2 = Vec::with_capacity(COUNTS2_SIZE); + // SAFETY: all accesses to the vector go through the bitmap to ensure no uninitialized + // data is ever read from these vectors. + unsafe { + counts1.set_len(COUNTS1_SIZE); + counts2.set_len(COUNTS2_SIZE); + } + + Self { + counts1, + counts2, + code1_index: CodesBitmap::default(), + pair_index: vec![CodesBitmap::default(); COUNTS1_SIZE], + } + } + + #[inline] + fn record_count1(&mut self, code1: u16) { + // If not set, we want to start at one. + let base = if self.code1_index.is_set(code1 as usize) { + self.counts1[code1 as usize] + } else { + 0 + }; + + self.counts1[code1 as usize] = base + 1; + self.code1_index.set(code1 as usize); + } + + #[inline] + fn record_count2(&mut self, code1: u16, code2: u16) { + debug_assert!(code1 == FSST_CODE_MASK || self.code1_index.is_set(code1 as usize)); + debug_assert!(self.code1_index.is_set(code2 as usize)); + + let idx = (code1 as usize) * COUNTS1_SIZE + (code2 as usize); + if self.pair_index[code1 as usize].is_set(code2 as usize) { + self.counts2[idx] += 1; + } else { + self.counts2[idx] = 1; + } + self.pair_index[code1 as usize].set(code2 as usize); + } + + #[inline] + fn count1(&self, code1: u16) -> usize { + debug_assert!(self.code1_index.is_set(code1 as usize)); + + self.counts1[code1 as usize] + } + + #[inline] + fn count2(&self, code1: u16, code2: u16) -> usize { + debug_assert!(self.code1_index.is_set(code1 as usize)); + debug_assert!(self.code1_index.is_set(code2 as usize)); + debug_assert!(self.pair_index[code1 as usize].is_set(code2 as usize)); + + let idx = (code1 as usize) * 512 + (code2 as usize); + self.counts2[idx] + } + + /// Returns an ordered iterator over the codes that were observed + /// in a call to [`Self::count1`]. 
+ fn first_codes(&self) -> CodesIterator<'_> { + self.code1_index.codes() + } + + /// Returns an iterator over the codes that have been observed + /// to follow `code1`. + /// + /// This is the set of all values `code2` where there was + /// previously a call to `self.record_count2(code1, code2)`. + fn second_codes(&self, code1: u16) -> CodesIterator<'_> { + self.pair_index[code1 as usize].codes() + } + + /// Clear the counters. + /// Note that this just touches the bitmaps and sets them all to invalid. + fn clear(&mut self) { + self.code1_index.clear(); + for index in &mut self.pair_index { + index.clear(); + } + } +} + +/// Entrypoint for building a new `Compressor`. +pub struct CompressorBuilder { + /// Table mapping codes to symbols. + /// + /// The entries 0-255 are setup in some other way here + symbols: Vec, + + /// The number of entries in the symbol table that have been populated, not counting + /// the escape values. + n_symbols: u8, + + /// Counts for number of symbols of each length. + /// + /// `len_histogram[len-1]` = count of the symbols of length `len`. + len_histogram: [u8; 8], + + /// Inverted index mapping 1-byte symbols to codes. + /// + /// This is only used for building, not used by the final `Compressor`. + codes_one_byte: Vec, + + /// Inverted index mapping 2-byte symbols to codes + codes_two_byte: Vec, + + /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more + lossy_pht: LossyPHT, +} + +impl CompressorBuilder { + /// Create a new builder. + pub fn new() -> Self { + // NOTE: `vec!` has a specialization for building a new vector of `0u64`. Because Symbol and u64 + // have the same bit pattern, we can allocate as u64 and transmute. If we do `vec![Symbol::EMPTY; N]`, + // that will create a new Vec and call `Symbol::EMPTY.clone()` `N` times which is considerably slower. + let symbols = vec![0u64; 511]; + + // SAFETY: transmute safety assured by the compiler. 
+ let symbols: Vec = unsafe { std::mem::transmute(symbols) }; + + let mut table = Self { + symbols, + n_symbols: 0, + len_histogram: [0; 8], + codes_two_byte: Vec::with_capacity(65_536), + codes_one_byte: Vec::with_capacity(512), + lossy_pht: LossyPHT::new(), + }; + + // Populate the escape byte entries. + for byte in 0..=255 { + let symbol = Symbol::from_u8(byte); + table.symbols[byte as usize] = symbol; + } + + // Fill codes_one_byte with pseudocodes for each byte. + for byte in 0..=255 { + // Push pseudocode for single-byte escape. + table.codes_one_byte.push(Code::new_escape(byte)); + } + + // Fill codes_two_byte with pseudocode of first byte + for idx in 0..=65_535 { + table.codes_two_byte.push(Code::new_escape(idx as u8)); + } + + table + } +} + +impl Default for CompressorBuilder { + fn default() -> Self { + Self::new() + } +} + +impl CompressorBuilder { + /// Attempt to insert a new symbol at the end of the table. + /// + /// # Panics + /// + /// Panics if the table is already full. + /// + /// # Returns + /// + /// Returns true if the symbol was inserted successfully, or false if it conflicted + /// with an existing symbol. + pub fn insert(&mut self, symbol: Symbol, len: usize) -> bool { + assert!(self.n_symbols < 255, "cannot insert into full symbol table"); + assert_eq!(len, symbol.len(), "provided len must equal symbol.len()"); + + if len == 2 { + // Check if this 2-byte symbol is already in the table. + if self.codes_two_byte[symbol.first2() as usize].extended_code() >= FSST_CODE_BASE { + return false; + } + self.codes_two_byte[symbol.first2() as usize] = + Code::new_symbol_building(self.n_symbols, 2); + } else if len == 1 { + // Check if this 1-byte symbol is already in the table. 
+ if self.codes_one_byte[symbol.first_byte() as usize].extended_code() >= FSST_CODE_BASE { + return false; + } + self.codes_one_byte[symbol.first_byte() as usize] = + Code::new_symbol_building(self.n_symbols, 1); + } else { + // Symbols of 3 or more bytes go into the hash table + if !self.lossy_pht.insert(symbol, len, self.n_symbols) { + return false; + } + } + + // Increment length histogram. + self.len_histogram[len - 1] += 1; + + // Insert successfully stored symbol at end of the symbol table + // Note the rescaling from range [0-254] -> [256, 510]. + self.symbols[256 + (self.n_symbols as usize)] = symbol; + self.n_symbols += 1; + true + } + + /// Clear all set items from the compressor. + /// + /// This is considerably faster than building a new Compressor from scratch for each + /// iteration of the `train` loop. + fn clear(&mut self) { + // Eliminate every observed code from the table. + for code in 0..(256 + self.n_symbols as usize) { + let symbol = self.symbols[code]; + if symbol.len() == 1 { + // Reset the entry from the codes_one_byte array. + self.codes_one_byte[symbol.first_byte() as usize] = + Code::new_escape(symbol.first_byte()); + } else if symbol.len() == 2 { + // Reset the entry from the codes_two_byte array. + self.codes_two_byte[symbol.first2() as usize] = + Code::new_escape(symbol.first_byte()); + } else { + // Clear the hashtable entry + self.lossy_pht.remove(symbol); + } + } + + // Reset len histogram + for i in 0..=7 { + self.len_histogram[i] = 0; + } + + self.n_symbols = 0; + } + + /// Finalizing the table is done once building is complete to prepare for efficient + /// compression. + /// + /// When we finalize the table, the following modifications are made in-place: + /// + /// 1. The codes are renumbered so that all symbols are ordered by length (order 23456781). + /// During this process, the two byte symbols are separated into a byte_lim and a suffix_lim, + /// so we know that we don't need to check the suffix limitations instead. 
+ /// 2. The 1-byte symbols index is merged into the 2-byte symbols index to allow for use of only + /// a single index in front of the hash table. + /// + /// # Returns + /// + /// Returns the `suffix_lim`, which is the index of the two-byte code before where we know + /// there are no longer suffixes in the symbol table. + /// + /// Also returns the lengths vector, which is of length `n_symbols` and contains the + /// length for each of the values. + fn finalize(&mut self) -> (u8, Vec<u8>) { + // Create a cumulative sum of each of the elements of the input line numbers. + // Do a map that includes the previously seen value as well. + // Regroup symbols based on their lengths. + // Space at the end of the symbol table reserved for the one-byte codes. + let byte_lim = self.n_symbols - self.len_histogram[0]; + + // Start code for each length. + // Length 1: at the end of symbol table. + // Length 2: starts at 0. Split into before/after suffixLim. + let mut codes_by_length = [0u8; 8]; + codes_by_length[0] = byte_lim; + codes_by_length[1] = 0; + + // codes for lengths 3..=8 start where the previous ones end. + for i in 1..7 { + codes_by_length[i + 1] = codes_by_length[i] + self.len_histogram[i]; + } + + // no_suffix_code is the lowest code for a symbol that does not have a longer 3+ byte + // suffix in the table. + // This value starts at 0 and extends up. + let mut no_suffix_code = 0; + + // The codes that do have a suffix begin just before the range of the 3-byte codes. 
+ let mut has_suffix_code = codes_by_length[2]; + + // Assign each symbol a new code ordered by lengths, in the order + // 2(no suffix) | 2 (suffix) | 3 | 4 | 5 | 6 | 7 | 8 | 1 + let mut new_codes = [0u8; FSST_CODE_BASE as usize]; + + let mut symbol_lens = [0u8; FSST_CODE_BASE as usize]; + + for i in 0..(self.n_symbols as usize) { + let symbol = self.symbols[256 + i]; + let len = symbol.len(); + if len == 2 { + let has_suffix = self + .symbols + .iter() + .skip(FSST_CODE_BASE as usize) + .enumerate() + .any(|(k, other)| i != k && symbol.first2() == other.first2()); + + if has_suffix { + // Symbols that have a longer suffix are inserted at the end of the 2-byte range + has_suffix_code -= 1; + new_codes[i] = has_suffix_code; + } else { + // Symbols that do not have a longer suffix are inserted at the start of + // the 2-byte range. + new_codes[i] = no_suffix_code; + no_suffix_code += 1; + } + } else { + // Assign new code based on the next code available for the given length symbol + new_codes[i] = codes_by_length[len - 1]; + codes_by_length[len - 1] += 1; + } + + // Write the symbol into the front half of the symbol table. + // We are reusing the space that was previously occupied by escapes. + self.symbols[new_codes[i] as usize] = symbol; + symbol_lens[new_codes[i] as usize] = len as u8; + } + + // Truncate the symbol table to only include the "true" symbols. + self.symbols.truncate(self.n_symbols as usize); + + // Rewrite the codes_one_byte table to point at the new code values. + // Replace pseudocodes with escapes. + for byte in 0..=255 { + let one_byte = self.codes_one_byte[byte]; + if one_byte.extended_code() >= FSST_CODE_BASE { + let new_code = new_codes[one_byte.code() as usize]; + self.codes_one_byte[byte] = Code::new_symbol(new_code, 1); + } else { + // After finalize: codes_one_byte contains the unused value + self.codes_one_byte[byte] = Code::UNUSED; + } + } + + // Rewrite the codes_two_byte table to point at the new code values. 
+        // Replace pseudocodes with escapes.
+        for two_bytes in 0..=65_535 {
+            let two_byte = self.codes_two_byte[two_bytes];
+            if two_byte.extended_code() >= FSST_CODE_BASE {
+                let new_code = new_codes[two_byte.code() as usize];
+                self.codes_two_byte[two_bytes] = Code::new_symbol(new_code, 2);
+            } else {
+                // The one-byte code for the given code number here...
+                self.codes_two_byte[two_bytes] = self.codes_one_byte[two_bytes & 0xFF];
+            }
+        }
+
+        // Reset values in the hash table as well.
+        self.lossy_pht.renumber(&new_codes);
+
+        // Pre-compute the lengths
+        let mut lengths = Vec::with_capacity(self.n_symbols as usize);
+        for symbol in &self.symbols {
+            lengths.push(symbol.len() as u8);
+        }
+
+        (has_suffix_code, lengths)
+    }
+
+    /// Build into the final hash table.
+    pub fn build(mut self) -> Compressor {
+        // finalize the symbol table by inserting the codes_twobyte values into
+        // the relevant parts of the `codes_onebyte` set.
+
+        let (has_suffix_code, lengths) = self.finalize();
+
+        Compressor {
+            symbols: self.symbols,
+            lengths,
+            n_symbols: self.n_symbols,
+            has_suffix_code,
+            codes_two_byte: self.codes_two_byte,
+            lossy_pht: self.lossy_pht,
+        }
+    }
+}
+
+/// The number of generations used for training. This is taken from the [FSST paper].
+///
+/// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf
+#[cfg(not(miri))]
+const GENERATIONS: [usize; 5] = [8usize, 38, 68, 98, 128];
+#[cfg(miri)]
+const GENERATIONS: [usize; 3] = [8usize, 38, 128];
+
+const FSST_SAMPLETARGET: usize = 1 << 14;
+const FSST_SAMPLEMAX: usize = 1 << 15;
+const FSST_SAMPLELINE: usize = 512;
+
+/// Create a sample from a set of strings in the input.
+///
+/// Sample is constructed by copying "chunks" from the `str_in`s into the `sample_buf`; the
+/// returned slices are pointers into the `sample_buf`.
+///
+/// SAFETY: sample_buf must be >= FSST_SAMPLEMAX bytes long. Providing something less may cause unexpected failures.
+#[allow(clippy::ptr_arg)] +fn make_sample<'a, 'b: 'a>(sample_buf: &'a mut Vec, str_in: &Vec<&'b [u8]>) -> Vec<&'a [u8]> { + assert!( + sample_buf.capacity() >= FSST_SAMPLEMAX, + "sample_buf.len() < FSST_SAMPLEMAX" + ); + + let mut sample: Vec<&[u8]> = Vec::new(); + + let tot_size: usize = str_in.iter().map(|s| s.len()).sum(); + if tot_size < FSST_SAMPLETARGET { + return str_in.clone(); + } + + let mut sample_rnd = fsst_hash(4637947); + let sample_lim = FSST_SAMPLETARGET; + let mut sample_buf_offset: usize = 0; + + while sample_buf_offset < sample_lim { + sample_rnd = fsst_hash(sample_rnd); + let line_nr = (sample_rnd as usize) % str_in.len(); + + // Find the first non-empty chunk starting at line_nr, wrapping around if + // necessary. + let Some(line) = (line_nr..str_in.len()) + .chain(0..line_nr) + .map(|line_nr| str_in[line_nr]) + .find(|line| !line.is_empty()) + else { + return sample; + }; + + let chunks = 1 + ((line.len() - 1) / FSST_SAMPLELINE); + sample_rnd = fsst_hash(sample_rnd); + let chunk = FSST_SAMPLELINE * ((sample_rnd as usize) % chunks); + + let len = FSST_SAMPLELINE.min(line.len() - chunk); + + sample_buf.extend_from_slice(&line[chunk..chunk + len]); + + // SAFETY: this is the data we just placed into `sample_buf` in the line above. + let slice = + unsafe { std::slice::from_raw_parts(sample_buf.as_ptr().add(sample_buf_offset), len) }; + + sample.push(slice); + + sample_buf_offset += len; + } + + sample +} + +/// Hash function used in various components of the library. +/// +/// This is equivalent to the FSST_HASH macro from the C++ implementation. +#[inline] +pub(crate) fn fsst_hash(value: u64) -> u64 { + value.wrapping_mul(2971215073) ^ value.wrapping_shr(15) +} + +impl Compressor { + /// Build and train a `Compressor` from a sample corpus of text. + /// + /// This function implements the generational algorithm described in the [FSST paper] Section + /// 4.3. 
Starting with an empty symbol table, it iteratively compresses the corpus, then attempts + /// to merge symbols when doing so would yield better compression than leaving them unmerged. The + /// resulting table will have at most 255 symbols (the 256th symbol is reserved for the escape + /// code). + /// + /// [FSST paper]: https://www.vldb.org/pvldb/vol13/p2649-boncz.pdf + pub fn train(values: &Vec<&[u8]>) -> Self { + let mut builder = CompressorBuilder::new(); + + if values.is_empty() { + return builder.build(); + } + + let mut counters = Counter::new(); + let mut sample_memory = Vec::with_capacity(FSST_SAMPLEMAX); + let mut pqueue = BinaryHeap::with_capacity(65_536); + + let sample = make_sample(&mut sample_memory, values); + for sample_frac in GENERATIONS { + for (i, line) in sample.iter().enumerate() { + if sample_frac < 128 && ((fsst_hash(i as u64) & 127) as usize) > sample_frac { + continue; + } + + builder.compress_count(line, &mut counters); + } + + // Clear the heap before we use it again + pqueue.clear(); + builder.optimize(&counters, sample_frac, &mut pqueue); + counters.clear(); + } + + builder.build() + } +} + +impl CompressorBuilder { + /// Find the longest symbol using the hash table and the codes_one_byte and codes_two_byte indexes. + fn find_longest_symbol(&self, word: u64) -> Code { + // Probe the hash table first to see if we have a long match + let entry = self.lossy_pht.lookup(word); + let ignored_bits = entry.ignored_bits; + + // If the entry is valid, return the code + if !entry.is_unused() && compare_masked(word, entry.symbol.to_u64(), ignored_bits) { + return entry.code; + } + + // Try and match first two bytes + let twobyte = self.codes_two_byte[word as u16 as usize]; + if twobyte.extended_code() >= FSST_CODE_BASE { + return twobyte; + } + + // Fall back to single-byte match + self.codes_one_byte[word as u8 as usize] + } + + /// Compress the text using the current symbol table. 
Count the code occurrences
+    /// and code-pair occurrences, calculating total gain using the current compressor.
+    ///
+    /// NOTE: this is largely an unfortunate amount of copy-paste from `compress`, just to make sure
+    /// we can do all the counting in a single pass.
+    fn compress_count(&self, sample: &[u8], counter: &mut Counter) -> usize {
+        let mut gain = 0;
+        if sample.is_empty() {
+            return gain;
+        }
+
+        let mut in_ptr = sample.as_ptr();
+
+        // SAFETY: `end` will point just after the end of the `plaintext` slice.
+        let in_end = unsafe { in_ptr.byte_add(sample.len()) };
+        let in_end_sub8 = in_end as usize - 8;
+
+        let mut prev_code: u16 = FSST_CODE_MASK;
+
+        while (in_ptr as usize) < (in_end_sub8) {
+            // SAFETY: ensured in-bounds by loop condition.
+            let word: u64 = unsafe { std::ptr::read_unaligned(in_ptr as *const u64) };
+            let code = self.find_longest_symbol(word);
+            let code_u16 = code.extended_code();
+
+            // Gain increases by the symbol length if a symbol matches, or 0
+            // if an escape is emitted.
+            gain += (code.len() as usize) - ((code_u16 < 256) as usize);
+
+            // Record the single and pair counts
+            counter.record_count1(code_u16);
+            counter.record_count2(prev_code, code_u16);
+
+            // Also record the count for just extending by a single byte, but only if
+            // the symbol is not itself a single byte.
+            if code.len() > 1 {
+                let code_first_byte = self.symbols[code_u16 as usize].first_byte() as u16;
+                counter.record_count1(code_first_byte);
+                counter.record_count2(prev_code, code_first_byte);
+            }
+
+            // SAFETY: pointer bound is checked in loop condition before any access is made.
+            in_ptr = unsafe { in_ptr.byte_add(code.len() as usize) };
+
+            prev_code = code_u16;
+        }
+
+        let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) };
+        assert!(
+            remaining_bytes.is_positive(),
+            "in_ptr exceeded in_end, should not be possible"
+        );
+        let remaining_bytes = remaining_bytes as usize;
+
+        // Load the last `remaining_byte`s of data into a final word.
We then replicate the loop above, + // but shift data out of this word rather than advancing an input pointer and potentially reading + // unowned memory + let mut bytes = [0u8; 8]; + unsafe { + // SAFETY: it is safe to read up to remaining_bytes from in_ptr, and remaining_bytes + // will be <= 8 bytes. + std::ptr::copy_nonoverlapping(in_ptr, bytes.as_mut_ptr(), remaining_bytes); + } + let mut last_word = u64::from_le_bytes(bytes); + + let mut remaining_bytes = remaining_bytes; + + while remaining_bytes > 0 { + // SAFETY: ensured in-bounds by loop condition. + let code = self.find_longest_symbol(last_word); + let code_u16 = code.extended_code(); + + // Gain increases by the symbol length if a symbol matches, or 0 + // if an escape is emitted. + gain += (code.len() as usize) - ((code_u16 < 256) as usize); + + // Record the single and pair counts + counter.record_count1(code_u16); + counter.record_count2(prev_code, code_u16); + + // Also record the count for just extending by a single byte, but only if + // the symbol is not itself a single byte. + if code.len() > 1 { + let code_first_byte = self.symbols[code_u16 as usize].first_byte() as u16; + counter.record_count1(code_first_byte); + counter.record_count2(prev_code, code_first_byte); + } + + // Advance our last_word "input pointer" by shifting off the covered values. + let advance = code.len() as usize; + remaining_bytes -= advance; + last_word = advance_8byte_word(last_word, advance); + + prev_code = code_u16; + } + + gain + } + + /// Using a set of counters and the existing set of symbols, build a new + /// set of symbols/codes that optimizes the gain over the distribution in `counter`. 
+ fn optimize( + &mut self, + counters: &Counter, + sample_frac: usize, + pqueue: &mut BinaryHeap, + ) { + for code1 in counters.first_codes() { + let symbol1 = self.symbols[code1 as usize]; + let symbol1_len = symbol1.len(); + let count = counters.count1(code1); + + // From the c++ impl: + // "improves both compression speed (less candidates), but also quality!!" + if count < (5 * sample_frac / 128) { + continue; + } + + let mut gain = count * symbol1_len; + // NOTE: use heuristic from C++ implementation to boost the gain of single-byte symbols. + // This helps to reduce exception counts. + if code1 < 256 { + gain *= 8; + } + + pqueue.push(Candidate { + symbol: symbol1, + gain, + }); + + // Skip merges on last round, or when symbol cannot be extended. + if sample_frac >= 128 || symbol1_len == 8 { + continue; + } + + for code2 in counters.second_codes(code1) { + let symbol2 = self.symbols[code2 as usize]; + + // If merging would yield a symbol of length greater than 8, skip. + if symbol1_len + symbol2.len() > 8 { + continue; + } + let new_symbol = symbol1.concat(symbol2); + let gain = counters.count2(code1, code2) * new_symbol.len(); + + pqueue.push(Candidate { + symbol: new_symbol, + gain, + }) + } + } + + // clear self in advance of inserting the symbols. + self.clear(); + + // Pop the 255 best symbols. + let mut n_symbols = 0; + while !pqueue.is_empty() && n_symbols < 255 { + let candidate = pqueue.pop().unwrap(); + if self.insert(candidate.symbol, candidate.symbol.len()) { + n_symbols += 1; + } + } + } +} + +/// A candidate for inclusion in a symbol table. +/// +/// This is really only useful for the `optimize` step of training. 
+#[derive(Copy, Clone, Debug)] +struct Candidate { + gain: usize, + symbol: Symbol, +} + +impl Candidate { + fn comparable_form(&self) -> (usize, usize) { + (self.gain, self.symbol.len()) + } +} + +impl Eq for Candidate {} + +impl PartialEq for Candidate { + fn eq(&self, other: &Self) -> bool { + self.comparable_form().eq(&other.comparable_form()) + } +} + +impl PartialOrd for Candidate { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +impl Ord for Candidate { + fn cmp(&self, other: &Self) -> Ordering { + let self_ord = (self.gain, self.symbol.len()); + let other_ord = (other.gain, other.symbol.len()); + + self_ord.cmp(&other_ord) + } +} + +#[cfg(test)] +mod test { + use crate::Compressor; + use crate::ESCAPE_CODE; + use crate::builder::CodesBitmap; + + #[test] + fn test_builder() { + // Train a Compressor on the toy string + let text = b"hello hello hello hello hello"; + + // count of 5 is the cutoff for including a symbol in the table. + let table = Compressor::train(&vec![text, text, text, text, text]); + + // Use the table to compress a string, see the values + let compressed = table.compress(text); + + // Ensure that the compressed string has no escape bytes + assert!(compressed.iter().all(|b| *b != ESCAPE_CODE)); + + // Ensure that we can compress a string with no values seen at training time, with escape bytes + let compressed = table.compress("xyz123".as_bytes()); + let decompressed = table.decompressor().decompress(&compressed); + assert_eq!(&decompressed, b"xyz123"); + assert_eq!( + compressed, + vec![ + ESCAPE_CODE, + b'x', + ESCAPE_CODE, + b'y', + ESCAPE_CODE, + b'z', + ESCAPE_CODE, + b'1', + ESCAPE_CODE, + b'2', + ESCAPE_CODE, + b'3', + ] + ); + } + + #[test] + fn test_bitmap() { + let mut map = CodesBitmap::default(); + map.set(10); + map.set(100); + map.set(500); + + let codes: Vec = map.codes().collect(); + assert_eq!(codes, vec![10u16, 100, 500]); + + // empty case + let map = CodesBitmap::default(); + 
assert!(map.codes().collect::>().is_empty()); + + // edge case: first bit in each block is set + let mut map = CodesBitmap::default(); + (0..8).for_each(|i| map.set(64 * i)); + assert_eq!( + map.codes().collect::>(), + (0u16..8).map(|i| 64 * i).collect::>(), + ); + + // Full bitmap case. There are only 512 values, so test them all + let mut map = CodesBitmap::default(); + for i in 0..512 { + map.set(i); + } + assert_eq!( + map.codes().collect::>(), + (0u16..511u16).collect::>() + ); + } + + #[test] + #[should_panic(expected = "code cannot exceed")] + fn test_bitmap_invalid() { + let mut map = CodesBitmap::default(); + map.set(512); + } + + #[test] + fn test_no_duplicate_symbols() { + // Train on data that is likely to produce duplicate 1-byte and 2-byte candidates. + let text = b"aababcabcdabcde"; + let corpus: Vec<&[u8]> = std::iter::repeat_n(text.as_slice(), 100).collect(); + let compressor = Compressor::train(&corpus); + + let symbols = compressor.symbol_table(); + let lengths = compressor.symbol_lengths(); + + // Collect all 1-byte symbols and check for duplicates. + let one_byte: Vec = symbols + .iter() + .zip(lengths.iter()) + .filter(|&(_, &len)| len == 1) + .map(|(sym, _)| sym.first_byte()) + .collect(); + let mut one_byte_sorted = one_byte.clone(); + one_byte_sorted.sort(); + one_byte_sorted.dedup(); + assert_eq!( + one_byte.len(), + one_byte_sorted.len(), + "duplicate 1-byte symbols found" + ); + + // Collect all 2-byte symbols and check for duplicates. 
+ let two_byte: Vec = symbols + .iter() + .zip(lengths.iter()) + .filter(|&(_, &len)| len == 2) + .map(|(sym, _)| sym.first2()) + .collect(); + let mut two_byte_sorted = two_byte.clone(); + two_byte_sorted.sort(); + two_byte_sorted.dedup(); + assert_eq!( + two_byte.len(), + two_byte_sorted.len(), + "duplicate 2-byte symbols found" + ); + } +} diff --git a/vendor/fsst-rs/src/lib.rs b/vendor/fsst-rs/src/lib.rs new file mode 100644 index 00000000000..33d843d48d8 --- /dev/null +++ b/vendor/fsst-rs/src/lib.rs @@ -0,0 +1,970 @@ +#![doc = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression"] +#![cfg(target_endian = "little")] + +/// Throw a compiler error if a type isn't guaranteed to have a specific size in bytes. +macro_rules! assert_sizeof { + ($typ:ty => $size_in_bytes:expr) => { + const _: [u8; $size_in_bytes] = [0; std::mem::size_of::<$typ>()]; + }; +} + +use std::fmt::Debug; +use std::fmt::Formatter; +use std::mem::MaybeUninit; + +use lossy_pht::LossyPHT; + +mod builder; +mod lossy_pht; + +pub use builder::*; + +/// `Symbol`s are small (up to 8-byte) segments of strings, stored in a [`Compressor`][`crate::Compressor`] and +/// identified by an 8-bit code. +#[derive(Copy, Clone, PartialEq, Eq, Hash)] +pub struct Symbol(u64); + +assert_sizeof!(Symbol => 8); + +impl Symbol { + /// Zero value for `Symbol`. + pub const ZERO: Self = Self::zero(); + + /// Constructor for a `Symbol` from an 8-element byte slice. + pub fn from_slice(slice: &[u8; 8]) -> Self { + let num: u64 = u64::from_le_bytes(*slice); + + Self(num) + } + + /// Return a zero symbol + const fn zero() -> Self { + Self(0) + } + + /// Create a new single-byte symbol + pub fn from_u8(value: u8) -> Self { + Self(value as u64) + } +} + +impl Symbol { + /// Calculate the length of the symbol in bytes. Always a value between 1 and 8. + /// + /// Each symbol has the capacity to hold up to 8 bytes of data, but the symbols + /// can contain fewer bytes, padded with 0x00. 
There is a special case of a symbol + /// that holds the byte 0x00. In that case, the symbol contains `0x0000000000000000` + /// but we want to interpret that as a one-byte symbol containing `0x00`. + #[allow(clippy::len_without_is_empty)] + pub fn len(self) -> usize { + let numeric = self.0; + // For little-endian platforms, this counts the number of *trailing* zeros + let null_bytes = (numeric.leading_zeros() >> 3) as usize; + + // Special case handling of a symbol with all-zeros. This is actually + // a 1-byte symbol containing 0x00. + let len = size_of::() - null_bytes; + if len == 0 { 1 } else { len } + } + + /// Returns the Symbol's inner representation. + #[inline] + pub fn to_u64(self) -> u64 { + self.0 + } + + /// Get the first byte of the symbol as a `u8`. + /// + /// If the symbol is empty, this will return the zero byte. + #[inline] + pub fn first_byte(self) -> u8 { + self.0 as u8 + } + + /// Get the first two bytes of the symbol as a `u16`. + /// + /// If the Symbol is one or zero bytes, this will return `0u16`. + #[inline] + pub fn first2(self) -> u16 { + self.0 as u16 + } + + /// Get the first three bytes of the symbol as a `u64`. + /// + /// If the Symbol is one or zero bytes, this will return `0u64`. + #[inline] + pub fn first3(self) -> u64 { + self.0 & 0xFF_FF_FF + } + + /// Return a new `Symbol` by logically concatenating ourselves with another `Symbol`. 
+ pub fn concat(self, other: Self) -> Self { + assert!( + self.len() + other.len() <= 8, + "cannot build symbol with length > 8" + ); + + let self_len = self.len(); + + Self((other.0 << (8 * self_len)) | self.0) + } +} + +impl Debug for Symbol { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "[")?; + + let slice = &self.0.to_le_bytes()[0..self.len()]; + for c in slice.iter().map(|c| *c as char) { + if ('!'..='~').contains(&c) { + write!(f, "{c}")?; + } else if c == '\n' { + write!(f, " \\n ")?; + } else if c == '\t' { + write!(f, " \\t ")?; + } else if c == ' ' { + write!(f, " SPACE ")?; + } else { + write!(f, " 0x{:X?} ", c as u8)? + } + } + + write!(f, "]") + } +} + +/// A packed type containing a code value, as well as metadata about the symbol referred to by +/// the code. +/// +/// Logically, codes can range from 0-255 inclusive. This type holds both the 8-bit code as well as +/// other metadata bit-packed into a `u16`. +/// +/// The bottom 8 bits contain EITHER a code for a symbol stored in the table, OR a raw byte. +/// +/// The interpretation depends on the 9th bit: when toggled off, the value stores a raw byte, and when +/// toggled on, it stores a code. Thus if you examine the bottom 9 bits of the `u16`, you have an extended +/// code range, where the values 0-255 are raw bytes, and the values 256-510 represent codes 0-254. 511 is +/// a placeholder for the invalid code here. +/// +/// Bits 12-15 store the length of the symbol (values ranging from 0-8). +#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)] +struct Code(u16); + +/// Code used to indicate bytes that are not in the symbol table. +/// +/// When compressing a string that cannot fully be expressed with the symbol table, the compressed +/// output will contain an `ESCAPE` byte followed by a raw byte. 
At decompression time, the presence +/// of `ESCAPE` indicates that the next byte should be appended directly to the result instead of +/// being looked up in the symbol table. +pub const ESCAPE_CODE: u8 = 255; + +/// Number of bits in the `ExtendedCode` that are used to dictate a code value. +pub const FSST_CODE_BITS: usize = 9; + +/// First bit of the "length" portion of an extended code. +pub const FSST_LEN_BITS: usize = 12; + +/// Maximum code value in the extended code range. +pub const FSST_CODE_MAX: u16 = 1 << FSST_CODE_BITS; + +/// Maximum value for the extended code range. +/// +/// When truncated to u8 this is code 255, which is equivalent to [`ESCAPE_CODE`]. +pub const FSST_CODE_MASK: u16 = FSST_CODE_MAX - 1; + +/// First code in the symbol table that corresponds to a non-escape symbol. +pub const FSST_CODE_BASE: u16 = 256; + +#[allow(clippy::len_without_is_empty)] +impl Code { + /// Code for an unused slot in a symbol table or index. + /// + /// This corresponds to the maximum code with a length of 1. + pub const UNUSED: Self = Code(FSST_CODE_MASK + (1 << 12)); + + /// Create a new code for a symbol of given length. + fn new_symbol(code: u8, len: usize) -> Self { + Self(code as u16 + ((len as u16) << FSST_LEN_BITS)) + } + + /// Code for a new symbol during the building phase. + /// + /// The code is remapped from 0..254 to 256...510. + fn new_symbol_building(code: u8, len: usize) -> Self { + Self(code as u16 + 256 + ((len as u16) << FSST_LEN_BITS)) + } + + /// Create a new code corresponding for an escaped byte. 
+ fn new_escape(byte: u8) -> Self { + Self((byte as u16) + (1 << FSST_LEN_BITS)) + } + + #[inline] + fn code(self) -> u8 { + self.0 as u8 + } + + #[inline] + fn extended_code(self) -> u16 { + self.0 & 0b111_111_111 + } + + #[inline] + fn len(self) -> u16 { + self.0 >> FSST_LEN_BITS + } +} + +impl Debug for Code { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + f.debug_struct("TrainingCode") + .field("code", &(self.0 as u8)) + .field("is_escape", &(self.0 < 256)) + .field("len", &(self.0 >> 12)) + .finish() + } +} + +/// Decompressor uses a symbol table to take a stream of 8-bit codes into a string. +#[derive(Clone)] +pub struct Decompressor<'a> { + /// Slice mapping codes to symbols. + pub(crate) symbols: &'a [Symbol], + + /// Slice containing the length of each symbol in the `symbols` slice. + pub(crate) lengths: &'a [u8], +} + +impl<'a> Decompressor<'a> { + /// Returns a new decompressor that uses the provided symbol table. + /// + /// # Panics + /// + /// If the provided symbol table has length greater than 256 + pub fn new(symbols: &'a [Symbol], lengths: &'a [u8]) -> Self { + assert!( + symbols.len() < FSST_CODE_BASE as usize, + "symbol table cannot have size exceeding 255" + ); + + Self { symbols, lengths } + } + + /// Returns an upper bound on the size of the decompressed data. + pub fn max_decompression_capacity(&self, compressed: &[u8]) -> usize { + size_of::() * (compressed.len() + 1) + } + + /// Decompress a slice of codes into a provided buffer. + /// + /// The provided `decoded` buffer must be at least the size of the decoded data, plus + /// an additional 7 bytes. + /// + /// ## Panics + /// + /// If the caller fails to provide sufficient capacity in the decoded buffer. An upper bound + /// on the required capacity can be obtained by calling [`Self::max_decompression_capacity`]. 
+    ///
+    /// ## Example
+    ///
+    /// ```
+    /// use fsst::{Symbol, Compressor, CompressorBuilder};
+    /// let compressor = {
+    ///     let mut builder = CompressorBuilder::new();
+    ///     builder.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', b'o', b'o', b'o']), 8);
+    ///     builder.build()
+    /// };
+    ///
+    /// let decompressor = compressor.decompressor();
+    ///
+    /// let mut decompressed = Vec::with_capacity(8 + 7);
+    ///
+    /// let len = decompressor.decompress_into(&[0], decompressed.spare_capacity_mut());
+    /// assert_eq!(len, 8);
+    /// unsafe { decompressed.set_len(len) };
+    /// assert_eq!(&decompressed, "helloooo".as_bytes());
+    /// ```
+    pub fn decompress_into(&self, compressed: &[u8], decoded: &mut [MaybeUninit<u8>]) -> usize {
+        // Ensure the target buffer is at least half the size of the input buffer.
+        // This is the theoretical smallest a valid target can be, and occurs when
+        // every input code is an escape.
+        assert!(
+            decoded.len() >= compressed.len() / 2,
+            "decoded is smaller than lower-bound decompressed size"
+        );
+
+        unsafe {
+            let mut in_ptr = compressed.as_ptr();
+            let _in_begin = in_ptr;
+            let in_end = in_ptr.add(compressed.len());
+
+            let mut out_ptr: *mut u8 = decoded.as_mut_ptr().cast();
+            let out_begin = out_ptr.cast_const();
+            let out_end = decoded.as_ptr().add(decoded.len()).cast::<u8>();
+
+            macro_rules! store_next_symbol {
+                ($code:expr) => {{
+                    out_ptr
+                        .cast::<u64>()
+                        .write_unaligned(self.symbols.get_unchecked($code as usize).to_u64());
+                    out_ptr = out_ptr.add(*self.lengths.get_unchecked($code as usize) as usize);
+                }};
+            }
+
+            // First we try loading 8 bytes at a time.
+            if decoded.len() >= 8 * size_of::<u64>() && compressed.len() >= 8 {
+                // Extract the loop condition since the compiler fails to do so
+                let block_out_end = out_end.sub(8 * size_of::<u64>());
+                let block_in_end = in_end.sub(8);
+
+                while out_ptr.cast_const() <= block_out_end && in_ptr < block_in_end {
+                    // Note that we load a little-endian u64 here.
+ let next_block = in_ptr.cast::().read_unaligned(); + let escape_mask = (next_block & 0x8080808080808080) + & ((((!next_block) & 0x7F7F7F7F7F7F7F7F) + 0x7F7F7F7F7F7F7F7F) + ^ 0x8080808080808080); + + // If there are no escape codes, we write each symbol one by one. + if escape_mask == 0 { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 40) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 48) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 56) & 0xFF) as u8; + store_next_symbol!(code); + in_ptr = in_ptr.add(8); + } else { + // Otherwise, find the first escape code and write the symbols up to that point. 
+ let first_escape_pos = escape_mask.trailing_zeros() >> 3; // Divide bits to bytes + debug_assert!(first_escape_pos < 8); + match first_escape_pos { + 7 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 40) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 48) & 0xFF) as u8; + store_next_symbol!(code); + + in_ptr = in_ptr.add(7); + } + 6 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 40) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 56) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(8); + } + 5 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 32) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 48) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(7); + } + 4 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = 
((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 24) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 40) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(6); + } + 3 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 16) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 32) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(5); + } + 2 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + let code = ((next_block >> 8) & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 24) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(4); + } + 1 => { + let code = (next_block & 0xFF) as u8; + store_next_symbol!(code); + + let escaped = ((next_block >> 16) & 0xFF) as u8; + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + + in_ptr = in_ptr.add(3); + } + 0 => { + // Otherwise, we actually need to decompress the next byte + // Extract the second byte from the u32 + let escaped = ((next_block >> 8) & 0xFF) as u8; + in_ptr = in_ptr.add(2); + out_ptr.write(escaped); + out_ptr = out_ptr.add(1); + } + _ => unreachable!(), + } + } + } + } + + // Otherwise, fall back to 1-byte reads. 
+ while out_end.offset_from(out_ptr) > size_of::() as isize && in_ptr < in_end { + let code = in_ptr.read(); + in_ptr = in_ptr.add(1); + + if code == ESCAPE_CODE { + out_ptr.write(in_ptr.read()); + in_ptr = in_ptr.add(1); + out_ptr = out_ptr.add(1); + } else { + store_next_symbol!(code); + } + } + + assert_eq!( + in_ptr, in_end, + "decompression should exhaust input before output" + ); + + out_ptr.offset_from(out_begin) as usize + } + } + + /// Decompress a byte slice that was previously returned by a compressor using the same symbol + /// table into a new vector of bytes. + pub fn decompress(&self, compressed: &[u8]) -> Vec { + let mut decoded = Vec::with_capacity(self.max_decompression_capacity(compressed) + 7); + + let len = self.decompress_into(compressed, decoded.spare_capacity_mut()); + // SAFETY: len bytes have now been initialized by the decompressor. + unsafe { decoded.set_len(len) }; + decoded + } +} + +/// A compressor that uses a symbol table to greedily compress strings. +/// +/// The `Compressor` is the central component of FSST. You can create a compressor either by +/// default (i.e. an empty compressor), or by [training][`Self::train`] it on an input corpus of text. +/// +/// Example usage: +/// +/// ``` +/// use fsst::{Symbol, Compressor, CompressorBuilder}; +/// let compressor = { +/// let mut builder = CompressorBuilder::new(); +/// builder.insert(Symbol::from_slice(&[b'h', b'e', b'l', b'l', b'o', 0, 0, 0]), 5); +/// builder.build() +/// }; +/// +/// let compressed = compressor.compress("hello".as_bytes()); +/// assert_eq!(compressed, vec![0u8]); +/// ``` +#[derive(Clone)] +pub struct Compressor { + /// Table mapping codes to symbols. + pub(crate) symbols: Vec, + + /// Length of each symbol, values range from 1-8. + pub(crate) lengths: Vec, + + /// The number of entries in the symbol table that have been populated, not counting + /// the escape values. 
+ pub(crate) n_symbols: u8, + + /// Inverted index mapping 2-byte symbols to codes + codes_two_byte: Vec, + + /// Limit of no suffixes. + has_suffix_code: u8, + + /// Lossy perfect hash table for looking up codes to symbols that are 3 bytes or more + lossy_pht: LossyPHT, +} + +/// The core structure of the FSST codec, holding a mapping between `Symbol`s and `Code`s. +/// +/// The symbol table is trained on a corpus of data in the form of a single byte array, building up +/// a mapping of 1-byte "codes" to sequences of up to 8 plaintext bytes, or "symbols". +impl Compressor { + /// Using the symbol table, runs a single cycle of compression on an input word, writing + /// the output into `out_ptr`. + /// + /// # Returns + /// + /// This function returns a tuple of (advance_in, advance_out) with the number of bytes + /// for the caller to advance the input and output pointers. + /// + /// `advance_in` is the number of bytes to advance the input pointer before the next call. + /// + /// `advance_out` is the number of bytes to advance `out_ptr` before the next call. + /// + /// # Safety + /// + /// `out_ptr` must never be NULL or otherwise point to invalid memory. + pub unsafe fn compress_word(&self, word: u64, out_ptr: *mut u8) -> (usize, usize) { + // Speculatively write the first byte of `word` at offset 1. This is necessary if it is an escape, and + // if it isn't, it will be overwritten anyway. + // + // SAFETY: caller ensures out_ptr is not null + let first_byte = word as u8; + // SAFETY: out_ptr is not null + unsafe { out_ptr.byte_add(1).write_unaligned(first_byte) }; + + // First, check the two_bytes table + let code_twobyte = self.codes_two_byte[word as u16 as usize]; + + if code_twobyte.code() < self.has_suffix_code { + // 2 byte code without having to worry about longer matches. + // SAFETY: out_ptr is not null. 
+ unsafe { std::ptr::write(out_ptr, code_twobyte.code()) }; + + // Advance input by symbol length (2) and output by a single code byte + (2, 1) + } else { + // Probe the hash table + let entry = self.lossy_pht.lookup(word); + + // Now, downshift the `word` and the `entry` to see if they align. + let ignored_bits = entry.ignored_bits; + if entry.code != Code::UNUSED + && compare_masked(word, entry.symbol.to_u64(), ignored_bits) + { + // Advance the input by the symbol length (variable) and the output by one code byte + // SAFETY: out_ptr is not null. + unsafe { std::ptr::write(out_ptr, entry.code.code()) }; + (entry.code.len() as usize, 1) + } else { + // SAFETY: out_ptr is not null + unsafe { std::ptr::write(out_ptr, code_twobyte.code()) }; + + // Advance the input by the symbol length (variable) and the output by either 1 + // byte (if was one-byte code) or two bytes (escape). + ( + code_twobyte.len() as usize, + // Predicated version of: + // + // if entry.code >= 256 { + // 2 + // } else { + // 1 + // } + 1 + (code_twobyte.extended_code() >> 8) as usize, + ) + } + } + } + + /// Compress many lines in bulk. + pub fn compress_bulk(&self, lines: &Vec<&[u8]>) -> Vec> { + let mut res = Vec::new(); + + for line in lines { + res.push(self.compress(line)); + } + + res + } + + /// Compress a string, writing its result into a target buffer. + /// + /// The target buffer is a byte vector that must have capacity large enough + /// to hold the encoded data. + /// + /// When this call returns, `values` will hold the compressed bytes and have + /// its length set to the length of the compressed text. + /// + /// ``` + /// use fsst::{Compressor, CompressorBuilder, Symbol}; + /// + /// let mut compressor = CompressorBuilder::new(); + /// assert!(compressor.insert(Symbol::from_slice(b"aaaaaaaa"), 8)); + /// + /// let compressor = compressor.build(); + /// + /// let mut compressed_values = Vec::with_capacity(1_024); + /// + /// // SAFETY: we have over-sized compressed_values. 
+ /// unsafe { + /// compressor.compress_into(b"aaaaaaaa", &mut compressed_values); + /// } + /// + /// assert_eq!(compressed_values, vec![0u8]); + /// ``` + /// + /// # Safety + /// + /// It is up to the caller to ensure the provided buffer is large enough to hold + /// all encoded data. + pub unsafe fn compress_into(&self, plaintext: &[u8], values: &mut Vec<u8>) { + let mut in_ptr = plaintext.as_ptr(); + let mut out_ptr = values.as_mut_ptr(); + + // SAFETY: `end` will point just after the end of the `plaintext` slice. + let in_end = unsafe { in_ptr.byte_add(plaintext.len()) }; + let in_end_sub8 = in_end as usize - 8; + // SAFETY: `end` will point just after the end of the `values` allocation. + let out_end = unsafe { out_ptr.byte_add(values.capacity()) }; + + while (in_ptr as usize) <= in_end_sub8 && out_ptr < out_end { + // SAFETY: pointer ranges are checked in the loop condition + unsafe { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null. we may read past end of pointer though. + let word: u64 = std::ptr::read_unaligned(in_ptr as *const u64); + let (advance_in, advance_out) = self.compress_word(word, out_ptr); + in_ptr = in_ptr.byte_add(advance_in); + out_ptr = out_ptr.byte_add(advance_out); + }; + } + + let remaining_bytes = unsafe { in_end.byte_offset_from(in_ptr) }; + assert!( + out_ptr < out_end || remaining_bytes == 0, + "output buffer sized too small" + ); + + let remaining_bytes = remaining_bytes as usize; + + // Load the last `remaining_byte`s of data into a final word. We then replicate the loop above, + // but shift data out of this word rather than advancing an input pointer and potentially reading + // unowned memory. 
+ let mut bytes = [0u8; 8]; + // SAFETY: remaining_bytes <= 8 + unsafe { std::ptr::copy_nonoverlapping(in_ptr, bytes.as_mut_ptr(), remaining_bytes) }; + let mut last_word = u64::from_le_bytes(bytes); + + while in_ptr < in_end && out_ptr < out_end { + // Load a full 8-byte word of data from in_ptr. + // SAFETY: caller asserts in_ptr is not null + let (advance_in, advance_out) = unsafe { self.compress_word(last_word, out_ptr) }; + // SAFETY: pointer ranges are checked in the loop condition + unsafe { + in_ptr = in_ptr.add(advance_in); + out_ptr = out_ptr.add(advance_out); + } + + last_word = advance_8byte_word(last_word, advance_in); + } + + // in_ptr should have exceeded in_end + assert!( + in_ptr >= in_end, + "exhausted output buffer before exhausting input, there is a bug in SymbolTable::compress()" + ); + + assert!(out_ptr <= out_end, "output buffer sized too small"); + + // SAFETY: out_ptr is derived from the `values` allocation. + let bytes_written = unsafe { out_ptr.offset_from(values.as_ptr()) }; + assert!( + bytes_written >= 0, + "out_ptr ended before it started, not possible" + ); + + // SAFETY: we have initialized `bytes_written` values in the output buffer. + unsafe { values.set_len(bytes_written as usize) }; + } + + /// Use the symbol table to compress the plaintext into a sequence of codes and escapes. + pub fn compress(&self, plaintext: &[u8]) -> Vec { + if plaintext.is_empty() { + return Vec::new(); + } + + let mut buffer = Vec::with_capacity(plaintext.len() * 2); + + // SAFETY: the largest compressed size would be all escapes == 2*plaintext_len + unsafe { self.compress_into(plaintext, &mut buffer) }; + + buffer + } + + /// Access the decompressor that can be used to decompress strings emitted from this + /// `Compressor` instance. + pub fn decompressor(&self) -> Decompressor<'_> { + Decompressor::new(self.symbol_table(), self.symbol_lengths()) + } + + /// Returns a readonly slice of the current symbol table. 
+ /// + /// The returned slice will have length of `n_symbols`. + pub fn symbol_table(&self) -> &[Symbol] { + &self.symbols[0..self.n_symbols as usize] + } + + /// Returns a readonly slice where index `i` contains the + /// length of the symbol represented by code `i`. + /// + /// Values range from 1-8. + pub fn symbol_lengths(&self) -> &[u8] { + &self.lengths[0..self.n_symbols as usize] + } + + /// Rebuild a compressor from an existing symbol table. + /// + /// This will not attempt to optimize or re-order the codes. + pub fn rebuild_from(symbols: impl AsRef<[Symbol]>, symbol_lens: impl AsRef<[u8]>) -> Self { + let symbols = symbols.as_ref(); + let symbol_lens = symbol_lens.as_ref(); + + assert_eq!( + symbols.len(), + symbol_lens.len(), + "symbols and lengths differ" + ); + assert!( + symbols.len() <= 255, + "symbol table len must be <= 255, was {}", + symbols.len() + ); + validate_symbol_order(symbol_lens); + + // Insert the symbols in their given order into the FSST lookup structures. + let symbols = symbols.to_vec(); + let lengths = symbol_lens.to_vec(); + let mut lossy_pht = LossyPHT::new(); + + let mut codes_one_byte = vec![Code::UNUSED; 256]; + + // Insert all of the one byte symbols first. + for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + if len == 1 { + codes_one_byte[symbol.first_byte() as usize] = Code::new_symbol(code as u8, 1); + } + } + + // Initialize the codes_two_byte table to be all escapes + let mut codes_two_byte = vec![Code::UNUSED; 65_536]; + + // Insert the two byte symbols, possibly overwriting slots for one-byte symbols and escapes. + for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + match len { + 2 => { + codes_two_byte[symbol.first2() as usize] = Code::new_symbol(code as u8, 2); + } + 3.. => { + assert!( + lossy_pht.insert(symbol, len as usize, code as u8), + "rebuild symbol insertion into PHT must succeed" + ); + } + _ => { /* Covered by the 1-byte loop above. 
*/ } + } + } + + // Build the finished codes_two_byte table, subbing in unused positions with the + // codes_one_byte value similar to what we do in CompressBuilder::finalize. + for (symbol, code) in codes_two_byte.iter_mut().enumerate() { + if *code == Code::UNUSED { + *code = codes_one_byte[symbol & 0xFF]; + } + } + + // Find the position of the first 2-byte code that has a suffix later in the table + let mut has_suffix_code = 0u8; + for (code, (&symbol, &len)) in symbols.iter().zip(lengths.iter()).enumerate() { + if len != 2 { + break; + } + let rest = &symbols[code..]; + if rest + .iter() + .any(|&other| other.len() > 2 && symbol.first2() == other.first2()) + { + has_suffix_code = code as u8; + break; + } + } + + Compressor { + n_symbols: symbols.len() as u8, + symbols, + lengths, + codes_two_byte, + lossy_pht, + has_suffix_code, + } + } +} + +#[inline] +pub(crate) fn advance_8byte_word(word: u64, bytes: usize) -> u64 { + // shift the word off the low-end, because little endian means the first + // char is stored in the LSB. + // + // Note that even though this looks like it branches, Rust compiles this to a + // conditional move instruction. See `` + if bytes == 8 { 0 } else { word >> (8 * bytes) } +} + +fn validate_symbol_order(symbol_lens: &[u8]) { + // Ensure that the symbol table is ordered by length, 23456781 + let mut expected = 2; + for (idx, &len) in symbol_lens.iter().enumerate() { + if expected == 1 { + assert_eq!( + len, 1, + "symbol code={idx} should be one byte, was {len} bytes" + ); + } else { + if len == 1 { + expected = 1; + } + + // we're in the non-zero portion. 
+ assert!( + len >= expected, + "symbol code={idx} breaks violates FSST symbol table ordering" + ); + expected = len; + } + } +} + +#[inline] +pub(crate) fn compare_masked(left: u64, right: u64, ignored_bits: u16) -> bool { + let mask = u64::MAX >> ignored_bits; + (left & mask) == right +} + +#[cfg(test)] +mod test { + use std::iter; + use std::mem; + + use super::*; + #[test] + fn test_stuff() { + let compressor = { + let mut builder = CompressorBuilder::new(); + builder.insert(Symbol::from_slice(b"helloooo"), 8); + builder.build() + }; + + let decompressor = compressor.decompressor(); + + let mut decompressed = Vec::with_capacity(8 + 7); + + let len = decompressor.decompress_into(&[0], decompressed.spare_capacity_mut()); + assert_eq!(len, 8); + unsafe { decompressed.set_len(len) }; + assert_eq!(&decompressed, "helloooo".as_bytes()); + } + + #[test] + fn test_symbols_good() { + let symbols_u64: &[u64] = &[ + 24931, 25698, 25442, 25699, 25186, 25444, 24932, 25188, 25185, 25441, 25697, 25700, + 24929, 24930, 25443, 25187, 6513249, 6512995, 6578786, 6513761, 6513507, 6382434, + 6579042, 6512994, 6447460, 6447969, 6382178, 6579041, 6512993, 6448226, 6513250, + 6579297, 6513506, 6447459, 6513764, 6447458, 6578529, 6382180, 6513762, 6447714, + 6579299, 6513508, 6382436, 6513763, 6578532, 6381924, 6448228, 6579300, 6381921, + 6382690, 6382179, 6447713, 6447972, 6513505, 6447457, 6382692, 6513252, 6578785, + 6578787, 6578531, 6448225, 6382177, 6382433, 6578530, 6448227, 6381922, 6578788, + 6579044, 6382691, 6512996, 6579043, 6579298, 6447970, 6447716, 6447971, 6381923, + 6447715, 97, 98, 100, 99, 97, 98, 99, 100, + ]; + let symbols: &[Symbol] = unsafe { mem::transmute(symbols_u64) }; + let lens: Vec = iter::repeat_n(2u8, 16) + .chain(iter::repeat_n(3u8, 61)) + .chain(iter::repeat_n(1u8, 8)) + .collect(); + + let compressor = Compressor::rebuild_from(symbols, lens); + let built_symbols: &[u64] = unsafe { mem::transmute(compressor.symbol_table()) }; + 
assert_eq!(built_symbols, symbols_u64); + } + + #[should_panic(expected = "assertion `left == right` failed")] + #[test] + fn test_symbols_bad() { + let symbols: &[u64] = &[ + 24931, 25698, 25442, 25699, 25186, 25444, 24932, 25188, 25185, 25441, 25697, 25700, + 24929, 24930, 25443, 25187, 6513249, 6512995, 6578786, 6513761, 6513507, 6382434, + 6579042, 6512994, 6447460, 6447969, 6382178, 6579041, 6512993, 6448226, 6513250, + 6579297, 6513506, 6447459, 6513764, 6447458, 6578529, 6382180, 6513762, 6447714, + 6579299, 6513508, 6382436, 6513763, 6578532, 6381924, 6448228, 6579300, 6381921, + 6382690, 6382179, 6447713, 6447972, 6513505, 6447457, 6382692, 6513252, 6578785, + 6578787, 6578531, 6448225, 6382177, 6382433, 6578530, 6448227, 6381922, 6578788, + 6579044, 6382691, 6512996, 6579043, 6579298, 6447970, 6447716, 6447971, 6381923, + 6447715, 97, 98, 100, 99, 97, 98, 99, 100, + ]; + let lens: Vec = iter::repeat_n(2u8, 16) + .chain(iter::repeat_n(3u8, 61)) + .chain(iter::repeat_n(1u8, 8)) + .collect(); + + let mut builder = CompressorBuilder::new(); + for (symbol, len) in symbols.iter().zip(lens.iter()) { + let symbol = Symbol::from_slice(&symbol.to_le_bytes()); + builder.insert(symbol, *len as usize); + } + let compressor = builder.build(); + let built_symbols: &[u64] = unsafe { mem::transmute(compressor.symbol_table()) }; + assert_eq!(built_symbols, symbols); + } +} diff --git a/vendor/fsst-rs/src/lossy_pht.rs b/vendor/fsst-rs/src/lossy_pht.rs new file mode 100644 index 00000000000..83a32030be4 --- /dev/null +++ b/vendor/fsst-rs/src/lossy_pht.rs @@ -0,0 +1,128 @@ +use std::fmt::Debug; + +use crate::Code; +use crate::Symbol; +use crate::builder::fsst_hash; + +/// Size of the perfect hash table. +/// +/// NOTE: this differs from the paper, which recommends a 64KB total +/// table size. The paper does not account for the fact that most +/// vendors split the L1 cache into 32KB of instruction and 32KB of data. 
+pub const HASH_TABLE_SIZE: usize = 1 << 11; + +/// A single entry in the [Lossy Perfect Hash Table][`LossyPHT`]. +/// +/// `TableEntry` is based on the `Symbol` class outlined in Algorithm 4 of the FSST paper. See +/// the module documentation for a link to the paper. +#[derive(Clone, Debug)] +#[repr(C)] +pub(crate) struct TableEntry { + /// Symbol, piece of a string, 8 bytes or fewer. + pub(crate) symbol: Symbol, + + /// Code and associated metadata for the symbol + pub(crate) code: Code, + + /// Number of ignored bits in `symbol`. + /// + /// This is equivalent to `64 - 8 * code.len()` but is pre-computed to save a few instructions in + /// the compression loop. + pub(crate) ignored_bits: u16, +} + +assert_sizeof!(TableEntry => 16); + +impl TableEntry { + pub(crate) fn is_unused(&self) -> bool { + self.code == Code::UNUSED + } +} + +/// Lossy Perfect Hash Table implementation for compression. +/// +/// This implements the "Lossy Perfect Hash Table" described in Section 5 of the paper. +/// +/// It is so-called because the `insert` operation for a symbol may fail, if another symbol is +/// already occupying the slot. +/// +/// If insertions are made from highest-gain to lowest and from longest-symbol to shortest, then +/// we can say that any failed insert is not a big loss, because its slot is being held by a higher-gain +/// symbol. Note that because other code in this crate calls `insert` in the pop-order of a max heap, +/// this holds. +#[derive(Clone, Debug)] +pub(crate) struct LossyPHT { + /// Hash table slots. Used for strings that are 3 bytes or more. + slots: Vec, +} + +impl LossyPHT { + /// Construct a new empty lossy perfect hash table + pub(crate) fn new() -> Self { + let slots = vec![ + TableEntry { + symbol: Symbol::ZERO, + code: Code::UNUSED, + ignored_bits: 64, + }; + HASH_TABLE_SIZE + ]; + + Self { slots } + } + + /// Try and insert the (symbol, code) pair into the table. 
+ /// + /// If there is a collision, we keep the current thing and reject the write. + /// + /// # Returns + /// + /// True if the symbol was inserted into the table, false if it was rejected due to collision. + pub(crate) fn insert(&mut self, symbol: Symbol, len: usize, code: u8) -> bool { + let prefix_3bytes = symbol.to_u64() & 0xFF_FF_FF; + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + let entry = &mut self.slots[slot]; + if !entry.is_unused() { + false + } else { + entry.symbol = symbol; + entry.code = Code::new_symbol_building(code, len); + entry.ignored_bits = (64 - 8 * symbol.len()) as u16; + true + } + } + + /// Given a new code mapping, rewrite the codes into the new code range. + pub(crate) fn renumber(&mut self, new_codes: &[u8]) { + for slot in self.slots.iter_mut() { + if slot.code != Code::UNUSED { + let old_code = slot.code.code(); + let new_code = new_codes[old_code as usize]; + let len = slot.code.len(); + slot.code = Code::new_symbol(new_code, len as usize); + } + } + } + + /// Remove the symbol from the hashtable, if it exists. + pub(crate) fn remove(&mut self, symbol: Symbol) { + let prefix_3bytes = symbol.to_u64() & 0xFF_FF_FF; + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + self.slots[slot].code = Code::UNUSED; + } + + #[inline] + pub(crate) fn lookup(&self, word: u64) -> &TableEntry { + let prefix_3bytes = word & 0xFF_FF_FF; + let slot = fsst_hash(prefix_3bytes) as usize & (HASH_TABLE_SIZE - 1); + + // SAFETY: the slot is guaranteed to between [0, HASH_TABLE_SIZE). 
+ unsafe { self.slots.get_unchecked(slot) } + } +} + +impl Default for LossyPHT { + fn default() -> Self { + Self::new() + } +} From 93e5df7a6e7f26f14b2da0fc5e2f53cb382e5462 Mon Sep 17 00:00:00 2001 From: Joe Isaacs Date: Thu, 12 Mar 2026 14:34:02 +0000 Subject: [PATCH 2/2] wip: fsst dedup Signed-off-by: Joe Isaacs --- Cargo.toml | 2 +- vendor/fsst-rs/Cargo.toml | 1 + vendor/fsst-rs/src/builder.rs | 3 +++ vendor/fsst-rs/src/lib.rs | 3 +++ vendor/fsst-rs/src/lossy_pht.rs | 3 +++ 5 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Cargo.toml b/Cargo.toml index d8d111f0822..71a58ae75e9 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -144,7 +144,7 @@ enum-iterator = "2.0.0" env_logger = "0.11" fastlanes = "0.5" flatbuffers = "25.2.10" -fsst-rs = { path = "vendor/fsst-rs" } +fsst-rs = { version = "0.5.6", path = "vendor/fsst-rs" } futures = { version = "0.3.31", default-features = false } fuzzy-matcher = "0.3" get_dir = "0.5.0" diff --git a/vendor/fsst-rs/Cargo.toml b/vendor/fsst-rs/Cargo.toml index 666a2166c7a..0f8488766a6 100644 --- a/vendor/fsst-rs/Cargo.toml +++ b/vendor/fsst-rs/Cargo.toml @@ -10,6 +10,7 @@ repository = "https://github.com/spiraldb/fsst" readme = "README.md" keywords = ["compression", "fsst"] categories = ["compression"] +publish = false [lib] name = "fsst" diff --git a/vendor/fsst-rs/src/builder.rs b/vendor/fsst-rs/src/builder.rs index f2b911c8b5b..4ef9b9b220c 100644 --- a/vendor/fsst-rs/src/builder.rs +++ b/vendor/fsst-rs/src/builder.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright SpiralDB Developers + //! Functions and types used for building a [`Compressor`] from a corpus of text. //! //! This module implements the logic from Algorithm 3 of the [FSST Paper]. 
diff --git a/vendor/fsst-rs/src/lib.rs b/vendor/fsst-rs/src/lib.rs index 33d843d48d8..74aa795105c 100644 --- a/vendor/fsst-rs/src/lib.rs +++ b/vendor/fsst-rs/src/lib.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright SpiralDB Developers + #![doc = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression"] #![cfg(target_endian = "little")] diff --git a/vendor/fsst-rs/src/lossy_pht.rs b/vendor/fsst-rs/src/lossy_pht.rs index 83a32030be4..efc765ad24e 100644 --- a/vendor/fsst-rs/src/lossy_pht.rs +++ b/vendor/fsst-rs/src/lossy_pht.rs @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright SpiralDB Developers + use std::fmt::Debug; use crate::Code;