Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@ members = [
"encodings/zigzag",
"encodings/zstd",
"encodings/bytebool",
# Vendored dependencies
"vendor/fsst-rs",
# Benchmarks
"benchmarks/lance-bench",
"benchmarks/compress-bench",
Expand Down Expand Up @@ -142,7 +144,7 @@ enum-iterator = "2.0.0"
env_logger = "0.11"
fastlanes = "0.5"
flatbuffers = "25.2.10"
fsst-rs = "0.5.5"
fsst-rs = { version = "0.5.6", path = "vendor/fsst-rs" }
futures = { version = "0.3.31", default-features = false }
fuzzy-matcher = "0.3"
get_dir = "0.5.0"
Expand Down
59 changes: 59 additions & 0 deletions encodings/fsst/examples/fsst_symbol_table.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

//! Reads lines from stdin, trains an FSST symbol table, and prints it.
//!
//! Usage:
//! cat urls.txt | cargo run -p vortex-fsst --example fsst_symbol_table
//! duckdb -csv -noheader -c "SELECT URL FROM 'hits_0.parquet' LIMIT 100000" | cargo run ...

#![allow(clippy::expect_used)]

use std::io;
use std::io::BufRead;

use vortex_array::arrays::VarBinArray;
use vortex_array::dtype::DType;
use vortex_array::dtype::Nullability;
use vortex_fsst::fsst_compress;
use vortex_fsst::fsst_train_compressor;

/// Trains an FSST compressor on the lines read from stdin and prints its symbol table.
///
/// The formatted table is written to stdout; the number of lines read and a summary of
/// duplicate symbols are written to stderr. Panics if a line cannot be read from stdin.
fn main() {
    // Gather every input line as an owned, non-null byte string.
    let stdin = io::stdin();
    let mut lines: Vec<Option<Box<[u8]>>> = Vec::new();
    for line in stdin.lock().lines() {
        let text = line.expect("failed to read line");
        lines.push(Some(text.into_bytes().into_boxed_slice()));
    }

    let n = lines.len();
    eprintln!("Read {n} lines from stdin");

    // Train on the full input, then compress it with the trained compressor.
    let dtype = DType::Utf8(Nullability::NonNullable);
    let varbin = VarBinArray::from_iter(lines, dtype);
    let compressor = fsst_train_compressor(&varbin);
    let fsst_array = fsst_compress(&varbin, &compressor);

    print!("{}", fsst_array.format_symbol_table());

    // Report duplicate symbols in the table. Two entries count as duplicates when both
    // their 64-bit value and their length match.
    let symbols = compressor.symbol_table();
    let lengths = compressor.symbol_lengths();
    let total = symbols.len();

    let mut keys: Vec<(u64, u8)> = Vec::new();
    for (sym, &len) in symbols.iter().zip(lengths.iter()) {
        keys.push((sym.to_u64(), len));
    }
    // Sorting makes equal keys adjacent so that `dedup` removes all repeats.
    keys.sort();
    keys.dedup();

    let duplicates = total - keys.len();
    eprintln!("Symbol table: {total} symbols, {duplicates} duplicates");
}
76 changes: 76 additions & 0 deletions encodings/fsst/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

use std::fmt::Debug;
use std::fmt::Formatter;
use std::fmt::Write;
use std::hash::Hash;
use std::sync::Arc;
use std::sync::LazyLock;
Expand Down Expand Up @@ -514,6 +515,46 @@ impl FSSTArray {
pub fn compressor(&self) -> &Compressor {
self.compressor.as_ref()
}

/// Format the FSST symbol table as a human-readable table.
///
/// Each row shows the symbol code, byte length, hex bytes, and a text representation.
/// In the text column printable ASCII (including space) is shown as-is, newline, tab and
/// carriage return are written as `\n`, `\t` and `\r`, a literal backslash is doubled
/// (`\\`) so it cannot be mistaken for the start of an escape, and every other byte is
/// written as `\xNN`.
///
/// A symbol holds at most 8 bytes; if a stored length exceeds that, all 8 bytes are shown
/// rather than panicking.
pub fn format_symbol_table(&self) -> String {
    /// Append the text-column rendering of `b` to `out`.
    fn push_escaped(out: &mut String, b: u8) {
        match b {
            b'\n' => out.push_str("\\n"),
            b'\t' => out.push_str("\\t"),
            b'\r' => out.push_str("\\r"),
            // Doubled so that the bytes `5c 6e` do not render the same as `0a`.
            b'\\' => out.push_str("\\\\"),
            b if b.is_ascii_graphic() || b == b' ' => out.push(char::from(b)),
            b => {
                // Writing into a `String` cannot fail.
                let _ = write!(out, "\\x{b:02x}");
            }
        }
    }

    let symbols = self.symbols.as_slice();
    let lengths = self.symbol_lengths.as_slice();
    let n = symbols.len();

    let mut out = format!("FSST Symbol Table ({n} symbols)\n");
    let _ = writeln!(out, "{:>4} | {:>3} | {:<23} | Text", "Code", "Len", "Hex");
    let _ = writeln!(out, "-----+-----+-------------------------+------");

    // Scratch buffers reused for every row instead of allocating per byte.
    // 23 = 8 bytes * 2 hex digits + 7 separating spaces, the widest possible hex column.
    let mut hex = String::with_capacity(23);
    let mut text = String::new();

    for (code, (sym, &len)) in symbols.iter().zip(lengths.iter()).enumerate() {
        let bytes = sym.to_u64().to_le_bytes();
        // Only the first `len` little-endian bytes belong to the symbol; fall back to
        // the whole word if the recorded length is out of range.
        let actual = bytes.get(..usize::from(len)).unwrap_or(&bytes[..]);

        hex.clear();
        text.clear();
        for (i, &b) in actual.iter().enumerate() {
            if i > 0 {
                hex.push(' ');
            }
            let _ = write!(hex, "{b:02x}");
            push_escaped(&mut text, b);
        }

        let _ = writeln!(out, "{code:>4} | {len:>3} | {hex:<23} | {text}");
    }
    out
}
}

impl ValidityChild<FSST> for FSST {
Expand Down Expand Up @@ -544,7 +585,9 @@ mod test {

use crate::FSST;
use crate::array::FSSTMetadata;
use crate::fsst_compress;
use crate::fsst_compress_iter;
use crate::fsst_train_compressor;

#[cfg_attr(miri, ignore)]
#[test]
Expand Down Expand Up @@ -628,4 +671,37 @@ mod test {
})
.unwrap()
}

/// Smoke test: trains and compresses a small URL corpus, then checks that the formatted
/// symbol table carries the expected title and column headings.
#[test]
fn test_format_symbol_table_urls() {
    use vortex_array::arrays::VarBinArray;

    // Clickbench-style URL data
    const URLS: [&[u8]; 10] = [
        b"http://smeshariki.ru/GameMain.aspx?id=123&tab=1#ref=123",
        b"http://smeshariki.ru/index.php?id=456&tab=2#ref=456",
        b"http://auto.ru/cars/used/sale?id=789&tab=3#ref=789",
        b"http://komme.ru/search?id=1000&tab=4#ref=1000",
        b"http://yandex.ru/news/article?id=2000&tab=5#ref=2000",
        b"http://mail.ru/user/profile?id=3000&tab=6#ref=3000",
        b"http://smeshariki.ru/catalog/item?id=4000&tab=7#ref=4000",
        b"http://auto.ru/forum/thread?id=5000&tab=8#ref=5000",
        b"http://vk.com/photo/album?id=6000&tab=9#ref=6000",
        b"http://livejournal.com/blog/post?id=7000&tab=10#ref=7000",
    ];

    let urls: Vec<Option<&[u8]>> = URLS.iter().copied().map(Some).collect();
    let dtype = DType::Utf8(Nullability::NonNullable);
    let varbin = VarBinArray::from_iter(urls, dtype);

    let compressor = fsst_train_compressor(&varbin);
    let table = fsst_compress(&varbin, &compressor).format_symbol_table();
    eprintln!("{table}");

    // Verify basic structure: the title line first, then each column heading.
    assert!(table.starts_with("FSST Symbol Table ("));
    for heading in ["Code", "Len", "Hex", "Text"] {
        assert!(table.contains(heading));
    }
}
}
56 changes: 56 additions & 0 deletions vendor/fsst-rs/Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
[package]
name = "fsst-rs"
version = "0.5.6"
edition = "2024"
rust-version = "1.86.0"
authors = ["SpiralDB Developers <hello@spiraldb.com>"]
description = "Pure-Rust implementation of Fast Static Symbol Tables algorithm for string compression"
license = "Apache-2.0"
repository = "https://github.com/spiraldb/fsst"
readme = "README.md"
keywords = ["compression", "fsst"]
categories = ["compression"]
publish = false

[lib]
name = "fsst"
path = "src/lib.rs"

[lints.clippy]
or_fun_call = "deny"

[lints.clippy.all]
level = "deny"
priority = -1

[lints.clippy.if_then_some_else_none]
level = "deny"
priority = 0

[lints.clippy.mem_forget]
level = "deny"
priority = 0

[lints.clippy.panic_in_result_fn]
level = "deny"
priority = 0

[lints.clippy.same_name_method]
level = "deny"
priority = 0

[lints.clippy.tests_outside_test_module]
level = "deny"
priority = 0

[lints.clippy.unwrap_in_result]
level = "deny"
priority = 0

[lints.clippy.use_debug]
level = "deny"
priority = 0

[lints.rust]
missing_docs = "deny"
warnings = "deny"
Loading
Loading