diff --git a/Cargo.toml b/Cargo.toml index a025edb2769..b2eea33447b 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -81,6 +81,7 @@ rust-version = "1.90" version = "0.1.0" [workspace.dependencies] +aho-corasick = "1.1.3" anyhow = "1.0.97" arbitrary = "1.3.2" arc-swap = "1.8" @@ -121,6 +122,7 @@ cudarc = { version = "0.18.2", features = [ "cuda-12050", ] } custom-labels = "0.4.4" +daachorse = "1.0.0" dashmap = "6.1.0" datafusion = { version = "52", default-features = false, features = ["sql"] } datafusion-catalog = { version = "52" } @@ -155,6 +157,7 @@ indicatif = "0.18.0" insta = "1.43" inventory = "0.3.20" itertools = "0.14.0" +jetscii = "0.5.3" jiff = "0.2.0" kanal = "0.1.1" lending-iterator = "0.1.7" @@ -163,6 +166,7 @@ libloading = "0.8" liblzma = "0.4" log = { version = "0.4.21" } loom = { version = "0.7", features = ["checkpoint"] } +memchr = "2.8.0" memmap2 = "0.9.5" mimalloc = "0.1.42" moka = { version = "0.12.10", default-features = false } @@ -196,6 +200,7 @@ rand = "0.9.0" rand_distr = "0.5" ratatui = { version = "0.30", default-features = false } regex = "1.11.0" +regex-automata = "0.4" reqwest = { version = "0.12.4", features = [ "charset", "http2", diff --git a/encodings/fsst/Cargo.toml b/encodings/fsst/Cargo.toml index 0dd5ce55a22..eb08bbda959 100644 --- a/encodings/fsst/Cargo.toml +++ b/encodings/fsst/Cargo.toml @@ -38,10 +38,17 @@ vortex-array = { workspace = true, features = ["_test-harness"] } [[bench]] name = "fsst_compress" harness = false +required-features = ["_test-harness"] + +[[bench]] +name = "fsst_contains" +harness = false +required-features = ["_test-harness"] [[bench]] name = "fsst_url_compare" harness = false +required-features = ["_test-harness"] [[bench]] name = "chunked_dict_fsst_builder" diff --git a/encodings/fsst/benches/fsst_contains.rs b/encodings/fsst/benches/fsst_contains.rs new file mode 100644 index 00000000000..6885ad0543e --- /dev/null +++ b/encodings/fsst/benches/fsst_contains.rs @@ -0,0 +1,112 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +#![allow(clippy::unwrap_used)] + +use std::fmt; +use std::sync::LazyLock; + +use divan::Bencher; +use vortex_array::Canonical; +use vortex_array::IntoArray; +use vortex_array::VortexSessionExecute; +use vortex_array::arrays::ConstantArray; +use vortex_array::arrays::scalar_fn::ScalarFnArrayExt; +use vortex_array::scalar_fn::fns::like::Like; +use vortex_array::scalar_fn::fns::like::LikeOptions; +use vortex_array::session::ArraySession; +use vortex_fsst::FSSTArray; +use vortex_fsst::test_utils::NUM_STRINGS; +use vortex_fsst::test_utils::make_fsst_clickbench_urls; +use vortex_fsst::test_utils::make_fsst_emails; +use vortex_fsst::test_utils::make_fsst_file_paths; +use vortex_fsst::test_utils::make_fsst_json_strings; +use vortex_fsst::test_utils::make_fsst_log_lines; +use vortex_fsst::test_utils::make_fsst_rare_match; +use vortex_fsst::test_utils::make_fsst_short_urls; +use vortex_session::VortexSession; + +fn main() { + divan::main(); +} + +static SESSION: LazyLock = + LazyLock::new(|| VortexSession::empty().with::()); + +const N: usize = NUM_STRINGS; + +static FSST_URLS: LazyLock = LazyLock::new(|| make_fsst_short_urls(N)); +static FSST_CB_URLS: LazyLock = LazyLock::new(|| make_fsst_clickbench_urls(N)); +static FSST_LOG_LINES: LazyLock = LazyLock::new(|| make_fsst_log_lines(N)); +static FSST_JSON_STRINGS: LazyLock = LazyLock::new(|| make_fsst_json_strings(N)); +static FSST_FILE_PATHS: LazyLock = LazyLock::new(|| make_fsst_file_paths(N)); +static FSST_EMAILS: LazyLock = LazyLock::new(|| make_fsst_emails(N)); +static FSST_RARE_MATCH: LazyLock = LazyLock::new(|| make_fsst_rare_match(N)); + +enum Dataset { + Urls, + Cb, + Log, + Json, + Path, + Email, + Rare, +} + +impl fmt::Display for Dataset { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + Self::Urls => f.write_str("urls"), + Self::Cb => f.write_str("cb"), + Self::Log => f.write_str("log"), + Self::Json => f.write_str("json"), + Self::Path => f.write_str("path"), + Self::Email => f.write_str("email"), + Self::Rare => f.write_str("rare"), + } + } +} + +impl Dataset { + fn fsst_array(&self) -> &'static FSSTArray { + match self { + Self::Urls => &FSST_URLS, + Self::Cb => &FSST_CB_URLS, + Self::Log => &FSST_LOG_LINES, + Self::Json => &FSST_JSON_STRINGS, + Self::Path => &FSST_FILE_PATHS, + Self::Email => &FSST_EMAILS, + Self::Rare => &FSST_RARE_MATCH, + } + } + + fn pattern(&self) -> &'static str { + match self { + Self::Urls => "%google%", + Self::Cb => "%yandex%", + Self::Log => "%Googlebot%", + Self::Json => "%enterprise%", + Self::Path => "%target/release%", + Self::Email => "%gmail%", + Self::Rare => "%xyzzy%", + } + } +} + +#[divan::bench(args = [ + Dataset::Urls, Dataset::Cb, Dataset::Log, Dataset::Json, + Dataset::Path, Dataset::Email, Dataset::Rare, +])] +fn fsst_like(bencher: Bencher, dataset: &Dataset) { + let fsst = dataset.fsst_array(); + let len = fsst.len(); + let arr = fsst.clone().into_array(); + let pattern = ConstantArray::new(dataset.pattern(), len).into_array(); + bencher.bench_local(|| { + Like.try_new_array(len, LikeOptions::default(), [arr.clone(), pattern.clone()]) + .unwrap() + .into_array() + .execute::(&mut SESSION.create_execution_ctx()) + .unwrap() + }); +} diff --git a/encodings/fsst/benches/fsst_url_compare.rs b/encodings/fsst/benches/fsst_url_compare.rs index 57bcde80cc3..6dc3ddbe087 100644 --- a/encodings/fsst/benches/fsst_url_compare.rs +++ b/encodings/fsst/benches/fsst_url_compare.rs @@ -6,9 +6,6 @@ use std::sync::LazyLock; use divan::Bencher; -use rand::Rng; -use rand::SeedableRng; -use rand::rngs::StdRng; use vortex_array::IntoArray; use vortex_array::RecursiveCanonical; use vortex_array::VortexSessionExecute; @@ -16,8 +13,6 @@ use vortex_array::arrays::ConstantArray; use vortex_array::arrays::VarBinArray; use vortex_array::builtins::ArrayBuiltins; use vortex_array::compute::warm_up_vtables; -use vortex_array::dtype::DType; -use vortex_array::dtype::Nullability; use vortex_array::expr::like; use vortex_array::expr::lit; use vortex_array::expr::root; @@ -26,6 +21,10 @@ use vortex_array::scalar_fn::fns::operators::Operator; use vortex_array::session::ArraySession; use vortex_fsst::fsst_compress; use vortex_fsst::fsst_train_compressor; +use vortex_fsst::test_utils::HIGH_MATCH_DOMAIN; +use vortex_fsst::test_utils::LOW_MATCH_DOMAIN; +use vortex_fsst::test_utils::NUM_STRINGS; +use vortex_fsst::test_utils::generate_url_data; use vortex_session::VortexSession; fn main() { @@ -36,76 +35,7 @@ fn main() { static SESSION: LazyLock = LazyLock::new(|| VortexSession::empty().with::()); -const NUM_URLS: usize = 100_000; - -/// A high-frequency domain that appears in ~50% of generated URLs. -const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru"; - -/// A low-frequency domain that appears in ~1% of generated URLs. -const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com"; - -// Domains modeled after real ClickBench URL distributions. -const DOMAINS: &[(&str, u32)] = &[ - ("smeshariki.ru", 500), // ~50% - ("auto.ru", 150), // ~15% - ("komme.ru", 100), // ~10% - ("yandex.ru", 80), // ~8% - ("mail.ru", 60), // ~6% - ("livejournal.com", 40), // ~4% - ("vk.com", 30), // ~3% - ("avito.ru", 20), // ~2% - ("kinopoisk.ru", 10), // ~1% - ("rare-example-domain.com", 10), // ~1% -]; - -const PATHS: &[&str] = &[ - "/GameMain.aspx", - "/index.php", - "/catalog/item", - "/search", - "/news/article", - "/user/profile", - "/collection/view", - "/cars/used/sale", - "/forum/thread", - "/photo/album", - "/video/watch", - "/download/file", - "/api/v1/resource", - "/shop/product", - "/blog/post", -]; - -/// Generate 100k realistic ClickBench-style URLs. -fn generate_url_data() -> VarBinArray { - let mut rng = StdRng::seed_from_u64(42); - - // Build a weighted domain lookup. - let total_weight: u32 = DOMAINS.iter().map(|(_, w)| w).sum(); - let urls: Vec>> = (0..NUM_URLS) - .map(|_| { - let domain_roll = rng.random_range(0..total_weight); - let mut cumulative = 0u32; - let mut domain = DOMAINS[0].0; - for &(d, w) in DOMAINS { - cumulative += w; - if domain_roll < cumulative { - domain = d; - break; - } - } - - let path = PATHS[rng.random_range(0..PATHS.len())]; - let query_id: u32 = rng.random_range(1..100_000); - let tab: u16 = rng.random_range(1..20); - - let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}"); - Some(url.into_bytes().into_boxed_slice()) - }) - .collect(); - - VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable)) -} +const NUM_URLS: usize = NUM_STRINGS; static URL_DATA: LazyLock = LazyLock::new(generate_url_data); diff --git a/encodings/fsst/src/test_utils.rs b/encodings/fsst/src/test_utils.rs index fcf0d331c5e..b078229b7c1 100644 --- a/encodings/fsst/src/test_utils.rs +++ b/encodings/fsst/src/test_utils.rs @@ -16,6 +16,7 @@ use vortex_array::dtype::NativePType; use vortex_array::dtype::Nullability; use vortex_error::VortexExpect; +use crate::FSSTArray; use crate::fsst_compress; use crate::fsst_train_compressor; @@ -59,3 +60,527 @@ pub fn gen_dict_fsst_test_data( DictArray::try_new(codes.into_array(), values) .vortex_expect("DictArray::try_new should succeed for test data") } + +// --------------------------------------------------------------------------- +// Benchmark dataset generators +// --------------------------------------------------------------------------- + +pub const NUM_STRINGS: usize = 100_000; + +// --------------------------------------------------------------------------- +// URL generator (ClickBench-style weighted domains) +// --------------------------------------------------------------------------- + +pub const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru"; +pub const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com"; + +pub const URL_DOMAINS: &[(&str, u32)] = &[ + ("smeshariki.ru", 500), + ("auto.ru", 150), + ("komme.ru", 100), + ("yandex.ru", 80), + ("mail.ru", 60), + ("livejournal.com", 40), + ("vk.com", 30), + ("avito.ru", 20), + ("kinopoisk.ru", 10), + ("rare-example-domain.com", 10), +]; + +pub const URL_PATHS: &[&str] = &[ + "/GameMain.aspx", + "/index.php", + "/catalog/item", + "/search", + "/news/article", + "/user/profile", + "/collection/view", + "/cars/used/sale", + "/forum/thread", + "/photo/album", + "/video/watch", + "/download/file", + "/api/v1/resource", + "/shop/product", + "/blog/post", +]; + +pub fn generate_url_data() -> VarBinArray { + generate_url_data_n(NUM_STRINGS) +} + +pub fn generate_url_data_n(n: usize) -> VarBinArray { + let mut rng = StdRng::seed_from_u64(42); + let total_weight: u32 = URL_DOMAINS.iter().map(|(_, w)| w).sum(); + let urls: Vec>> = (0..n) + .map(|_| { + let domain_roll = rng.random_range(0..total_weight); + let mut cumulative = 0u32; + let mut domain = URL_DOMAINS[0].0; + for &(d, w) in URL_DOMAINS { + cumulative += w; + if domain_roll < cumulative { + domain = d; + break; + } + } + let path = URL_PATHS[rng.random_range(0..URL_PATHS.len())]; + let query_id: u32 = rng.random_range(1..100_000); + let tab: u16 = rng.random_range(1..20); + let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}"); + Some(url.into_bytes().into_boxed_slice()) + }) + .collect(); + VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable)) +} + +pub fn make_fsst_urls(n: usize) -> FSSTArray { + let varbin = generate_url_data_n(n); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// ClickBench-style URL generator (longer URLs with query params, fragments) +// --------------------------------------------------------------------------- + +const CB_DOMAINS: &[&str] = &[ + "www.google.com", + "yandex.ru", + "mail.ru", + "vk.com", + "www.youtube.com", + "www.facebook.com", + "ok.ru", + "go.mail.ru", + "www.avito.ru", + "pogoda.yandex.ru", + "news.yandex.ru", + "maps.yandex.ru", + "market.yandex.ru", + "afisha.yandex.ru", + "auto.ru", + "www.kinopoisk.ru", + "www.ozon.ru", + "www.wildberries.ru", + "aliexpress.ru", + "lenta.ru", +]; + +const CB_PATHS: &[&str] = &[ + "/search", + "/catalog/electronics/smartphones", + "/product/item/123456789", + "/news/2024/03/15/article-about-technology", + "/user/profile/settings/notifications", + "/api/v2/catalog/search", + "/checkout/cart/summary", + "/blog/2024/how-to-optimize-database-queries-for-better-performance", + "/category/home-and-garden/furniture/tables", + "/", +]; + +const CB_PARAMS: &[&str] = &[ + "?utm_source=google&utm_medium=cpc&utm_campaign=spring_sale_2024&utm_content=banner_v2", + "?q=buy+smartphone+online+cheap+free+shipping&category=electronics&sort=price_asc&page=3", + "?ref=main_page_carousel_block_position_4&sessionid=abc123def456", + "?from=tabbar&clid=2270455&text=weather+forecast+tomorrow", + "?lr=213&msid=1234567890.12345&suggest_reqid=abcdef&csg=12345", + "", + "", + "", + "?page=1&per_page=20", + "?source=serp&forceshow=1", +]; + +const CB_FRAGMENTS: &[&str] = &[ + "", + "", + "", + "#section-reviews", + "#comments", + "#price-history", + "", + "", + "", + "", +]; + +pub fn generate_clickbench_urls(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(123); + (0..n) + .map(|_| { + let scheme = if rng.random_bool(0.7) { + "https" + } else { + "http" + }; + let domain = CB_DOMAINS[rng.random_range(0..CB_DOMAINS.len())]; + let path = CB_PATHS[rng.random_range(0..CB_PATHS.len())]; + let params = CB_PARAMS[rng.random_range(0..CB_PARAMS.len())]; + let fragment = CB_FRAGMENTS[rng.random_range(0..CB_FRAGMENTS.len())]; + format!("{scheme}://{domain}{path}{params}{fragment}") + }) + .collect() +} + +pub fn make_fsst_clickbench_urls(n: usize) -> FSSTArray { + let urls = generate_clickbench_urls(n); + let varbin = VarBinArray::from_iter( + urls.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Short URL generator (simple URLs for contains benchmarks) +// --------------------------------------------------------------------------- + +const SHORT_URL_DOMAINS: &[&str] = &[ + "google.com", + "facebook.com", + "github.com", + "stackoverflow.com", + "amazon.com", + "reddit.com", + "twitter.com", + "youtube.com", + "wikipedia.org", + "microsoft.com", + "apple.com", + "netflix.com", + "linkedin.com", + "cloudflare.com", + "google.co.uk", + "docs.google.com", + "mail.google.com", + "maps.google.com", + "news.ycombinator.com", + "arxiv.org", +]; + +const SHORT_URL_PATHS: &[&str] = &[ + "/index.html", + "/about", + "/search?q=vortex", + "/user/profile/settings", + "/api/v2/data", + "/blog/2024/post", + "/products/item/12345", + "/docs/reference/guide", + "/login", + "/dashboard/analytics", +]; + +pub fn generate_short_urls(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(42); + (0..n) + .map(|_| { + let scheme = if rng.random_bool(0.8) { + "https" + } else { + "http" + }; + let domain = SHORT_URL_DOMAINS[rng.random_range(0..SHORT_URL_DOMAINS.len())]; + let path = SHORT_URL_PATHS[rng.random_range(0..SHORT_URL_PATHS.len())]; + format!("{scheme}://{domain}{path}") + }) + .collect() +} + +pub fn make_fsst_short_urls(n: usize) -> FSSTArray { + let urls = generate_short_urls(n); + let varbin = VarBinArray::from_iter( + urls.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Log lines generator (Apache/nginx-style access logs) +// --------------------------------------------------------------------------- + +const LOG_METHODS: &[&str] = &["GET", "POST", "PUT", "DELETE", "PATCH", "HEAD"]; +const LOG_PATHS: &[&str] = &[ + "/api/v1/users", + "/api/v2/products/search", + "/healthcheck", + "/static/js/app.bundle.min.js", + "/favicon.ico", + "/login", + "/dashboard/analytics", + "/api/v1/orders/12345/status", + "/graphql", + "/metrics", +]; +const LOG_STATUS: &[u16] = &[ + 200, 200, 200, 200, 200, 201, 301, 302, 400, 403, 404, 500, 502, +]; +const LOG_IPS: &[&str] = &[ + "192.168.1.1", + "10.0.0.42", + "172.16.0.100", + "203.0.113.50", + "198.51.100.23", + "8.8.8.8", + "1.1.1.1", + "74.125.200.100", + "151.101.1.69", + "93.184.216.34", +]; +const LOG_UAS: &[&str] = &[ + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)", + "curl/7.81.0", + "python-requests/2.28.1", + "Go-http-client/1.1", + "Googlebot/2.1 (+http://www.google.com/bot.html)", +]; + +pub fn generate_log_lines(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(456); + (0..n) + .map(|_| { + let ip = LOG_IPS[rng.random_range(0..LOG_IPS.len())]; + let method = LOG_METHODS[rng.random_range(0..LOG_METHODS.len())]; + let path = LOG_PATHS[rng.random_range(0..LOG_PATHS.len())]; + let status = LOG_STATUS[rng.random_range(0..LOG_STATUS.len())]; + let size = rng.random_range(100..50000); + let ua = LOG_UAS[rng.random_range(0..LOG_UAS.len())]; + format!( + r#"{ip} - - [15/Mar/2024:10:{:02}:{:02} +0000] "{method} {path} HTTP/1.1" {status} {size} "-" "{ua}""#, + rng.random_range(0..60u32), + rng.random_range(0..60u32), + ) + }) + .collect() +} + +pub fn make_fsst_log_lines(n: usize) -> FSSTArray { + let lines = generate_log_lines(n); + let varbin = VarBinArray::from_iter( + lines.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// JSON strings generator (typical API response payloads) +// --------------------------------------------------------------------------- + +const JSON_NAMES: &[&str] = &[ + "Alice", "Bob", "Charlie", "Diana", "Eve", "Frank", "Grace", "Hank", "Ivy", "Jack", +]; +const JSON_CITIES: &[&str] = &[ + "New York", + "London", + "Tokyo", + "Berlin", + "Sydney", + "Toronto", + "Paris", + "Mumbai", + "São Paulo", + "Seoul", +]; +const JSON_TAGS: &[&str] = &[ + "premium", + "verified", + "admin", + "moderator", + "subscriber", + "trial", + "enterprise", + "developer", +]; + +pub fn generate_json_strings(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(789); + (0..n) + .map(|_| { + let name = JSON_NAMES[rng.random_range(0..JSON_NAMES.len())]; + let city = JSON_CITIES[rng.random_range(0..JSON_CITIES.len())]; + let age = rng.random_range(18..80u32); + let tag1 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; + let tag2 = JSON_TAGS[rng.random_range(0..JSON_TAGS.len())]; + let id = rng.random_range(10000..99999u32); + format!( + r#"{{"id":{id},"name":"{name}","age":{age},"city":"{city}","tags":["{tag1}","{tag2}"],"active":true}}"# + ) + }) + .collect() +} + +pub fn make_fsst_json_strings(n: usize) -> FSSTArray { + let jsons = generate_json_strings(n); + let varbin = VarBinArray::from_iter( + jsons.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// File paths generator (Unix-style paths with various depths) +// --------------------------------------------------------------------------- + +const PATH_ROOTS: &[&str] = &[ + "/home/user", + "/var/log", + "/etc", + "/usr/local/bin", + "/opt/app", + "/tmp", + "/srv/www", + "/data/warehouse", +]; +const PATH_DIRS: &[&str] = &[ + "src", + "build", + "dist", + "node_modules", + "target/release", + "config", + ".cache", + "logs/2024", + "backups/daily", + "migrations", +]; +const PATH_FILES: &[&str] = &[ + "main.rs", + "index.ts", + "config.yaml", + "Dockerfile", + "schema.sql", + "app.log", + "data.parquet", + "model.onnx", + "README.md", + "package.json", +]; + +pub fn generate_file_paths(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(321); + (0..n) + .map(|_| { + let root = PATH_ROOTS[rng.random_range(0..PATH_ROOTS.len())]; + let dir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; + let file = PATH_FILES[rng.random_range(0..PATH_FILES.len())]; + let depth = rng.random_range(0..3u32); + let mut path = format!("{root}/{dir}"); + for _ in 0..depth { + let subdir = PATH_DIRS[rng.random_range(0..PATH_DIRS.len())]; + path.push('/'); + path.push_str(subdir); + } + path.push('/'); + path.push_str(file); + path + }) + .collect() +} + +pub fn make_fsst_file_paths(n: usize) -> FSSTArray { + let paths = generate_file_paths(n); + let varbin = VarBinArray::from_iter( + paths.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Email addresses generator +// --------------------------------------------------------------------------- + +const EMAIL_USERS: &[&str] = &[ + "john.doe", + "jane.smith", + "admin", + "support", + "no-reply", + "sales.team", + "dev+test", + "marketing", + "info", + "contact.us", +]; +const EMAIL_DOMAINS: &[&str] = &[ + "gmail.com", + "yahoo.com", + "outlook.com", + "company.io", + "example.org", + "mail.ru", + "protonmail.com", + "fastmail.com", + "icloud.com", + "hey.com", +]; + +pub fn generate_emails(n: usize) -> Vec { + let mut rng = StdRng::seed_from_u64(654); + (0..n) + .map(|_| { + let user = EMAIL_USERS[rng.random_range(0..EMAIL_USERS.len())]; + let domain = EMAIL_DOMAINS[rng.random_range(0..EMAIL_DOMAINS.len())]; + let suffix = rng.random_range(0..1000u32); + format!("{user}{suffix}@{domain}") + }) + .collect() +} + +pub fn make_fsst_emails(n: usize) -> FSSTArray { + let emails = generate_emails(n); + let varbin = VarBinArray::from_iter( + emails.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +} + +// --------------------------------------------------------------------------- +// Rare match strings generator +// --------------------------------------------------------------------------- + +pub const RARE_NEEDLE: &[u8] = b"xyzzy"; + +pub fn generate_rare_match_strings(n: usize, match_rate: f64) -> Vec { + let mut rng = StdRng::seed_from_u64(999); + let charset: &[u8] = b"abcdefghijklmnopqrstuvwABCDEFGHIJKLMNOPQRSTUVW0123456789-_.:/"; + (0..n) + .map(|_| { + let len = rng.random_range(30..60); + let mut s: String = (0..len) + .map(|_| charset[rng.random_range(0..charset.len())] as char) + .collect(); + if rng.random_bool(match_rate) { + let pos = rng.random_range(0..s.len().saturating_sub(RARE_NEEDLE.len()) + 1); + s.replace_range( + pos..pos + RARE_NEEDLE.len().min(s.len() - pos), + std::str::from_utf8(RARE_NEEDLE).unwrap(), + ); + } + s + }) + .collect() +} + +pub fn make_fsst_rare_match(n: usize) -> FSSTArray { + let strings = generate_rare_match_strings(n, 0.00001); + let varbin = VarBinArray::from_iter( + strings.iter().map(|s| Some(s.as_str())), + DType::Utf8(Nullability::NonNullable), + ); + let compressor = fsst_train_compressor(&varbin); + fsst_compress(varbin, &compressor) +}