Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@ rust-version = "1.90"
version = "0.1.0"

[workspace.dependencies]
aho-corasick = "1.1.3"
anyhow = "1.0.97"
arbitrary = "1.3.2"
arc-swap = "1.8"
Expand Down Expand Up @@ -121,6 +122,7 @@ cudarc = { version = "0.18.2", features = [
"cuda-12050",
] }
custom-labels = "0.4.4"
daachorse = "1.0.0"
dashmap = "6.1.0"
datafusion = { version = "52", default-features = false, features = ["sql"] }
datafusion-catalog = { version = "52" }
Expand Down Expand Up @@ -155,6 +157,7 @@ indicatif = "0.18.0"
insta = "1.43"
inventory = "0.3.20"
itertools = "0.14.0"
jetscii = "0.5.3"
jiff = "0.2.0"
kanal = "0.1.1"
lending-iterator = "0.1.7"
Expand All @@ -163,6 +166,7 @@ libloading = "0.8"
liblzma = "0.4"
log = { version = "0.4.21" }
loom = { version = "0.7", features = ["checkpoint"] }
memchr = "2.8.0"
memmap2 = "0.9.5"
mimalloc = "0.1.42"
moka = { version = "0.12.10", default-features = false }
Expand Down Expand Up @@ -196,6 +200,7 @@ rand = "0.9.0"
rand_distr = "0.5"
ratatui = { version = "0.30", default-features = false }
regex = "1.11.0"
regex-automata = "0.4"
reqwest = { version = "0.12.4", features = [
"charset",
"http2",
Expand Down
7 changes: 7 additions & 0 deletions encodings/fsst/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,17 @@ vortex-array = { workspace = true, features = ["_test-harness"] }
[[bench]]
name = "fsst_compress"
harness = false
required-features = ["_test-harness"]

[[bench]]
name = "fsst_contains"
harness = false
required-features = ["_test-harness"]

[[bench]]
name = "fsst_url_compare"
harness = false
required-features = ["_test-harness"]

[[bench]]
name = "chunked_dict_fsst_builder"
Expand Down
112 changes: 112 additions & 0 deletions encodings/fsst/benches/fsst_contains.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

#![allow(clippy::unwrap_used)]

use std::fmt;
use std::sync::LazyLock;

use divan::Bencher;
use vortex_array::Canonical;
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::scalar_fn::ScalarFnArrayExt;
use vortex_array::scalar_fn::fns::like::Like;
use vortex_array::scalar_fn::fns::like::LikeOptions;
use vortex_array::session::ArraySession;
use vortex_fsst::FSSTArray;
use vortex_fsst::test_utils::NUM_STRINGS;
use vortex_fsst::test_utils::make_fsst_clickbench_urls;
use vortex_fsst::test_utils::make_fsst_emails;
use vortex_fsst::test_utils::make_fsst_file_paths;
use vortex_fsst::test_utils::make_fsst_json_strings;
use vortex_fsst::test_utils::make_fsst_log_lines;
use vortex_fsst::test_utils::make_fsst_rare_match;
use vortex_fsst::test_utils::make_fsst_short_urls;
use vortex_session::VortexSession;

/// Benchmark entry point; delegates to `divan::main`.
fn main() {
    divan::main();
}

/// Session shared by all benchmarks in this file.
///
/// Created on first access from `VortexSession::empty()` and extended through
/// `with::<ArraySession>()`.
static SESSION: LazyLock<VortexSession> = LazyLock::new(|| {
    let session = VortexSession::empty();
    session.with::<ArraySession>()
});

// Size argument handed to every `make_fsst_*` generator below
// (presumably the number of strings per dataset -- confirm against `test_utils`).
const N: usize = NUM_STRINGS;

// One lazily-initialised FSST array per dataset. Each is built on first access by the
// matching `make_fsst_*` helper, so the construction cost is paid once per process and
// not inside the timed benchmark closure.
static FSST_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_short_urls(N));
static FSST_CB_URLS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_clickbench_urls(N));
static FSST_LOG_LINES: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_log_lines(N));
static FSST_JSON_STRINGS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_json_strings(N));
static FSST_FILE_PATHS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_file_paths(N));
static FSST_EMAILS: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_emails(N));
static FSST_RARE_MATCH: LazyLock<FSSTArray> = LazyLock::new(|| make_fsst_rare_match(N));

/// Selects which benchmark corpus a `fsst_like` run operates on.
///
/// The `Display` output is the short lowercase label of the variant.
enum Dataset {
    Urls,
    Cb,
    Log,
    Json,
    Path,
    Email,
    Rare,
}

impl fmt::Display for Dataset {
    /// Writes the variant's short label verbatim (formatter width/padding is not applied).
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Dataset::Urls => "urls",
            Dataset::Cb => "cb",
            Dataset::Log => "log",
            Dataset::Json => "json",
            Dataset::Path => "path",
            Dataset::Email => "email",
            Dataset::Rare => "rare",
        };
        f.write_str(label)
    }
}

impl Dataset {
fn fsst_array(&self) -> &'static FSSTArray {
match self {
Self::Urls => &FSST_URLS,
Self::Cb => &FSST_CB_URLS,
Self::Log => &FSST_LOG_LINES,
Self::Json => &FSST_JSON_STRINGS,
Self::Path => &FSST_FILE_PATHS,
Self::Email => &FSST_EMAILS,
Self::Rare => &FSST_RARE_MATCH,
}
}

fn pattern(&self) -> &'static str {
match self {
Self::Urls => "%google%",
Self::Cb => "%yandex%",
Self::Log => "%Googlebot%",
Self::Json => "%enterprise%",
Self::Path => "%target/release%",
Self::Email => "%gmail%",
Self::Rare => "%xyzzy%",
}
}
}

/// Benchmarks a `Like` evaluation of the dataset's pattern over its FSST array,
/// executing the result into `Canonical` form on every iteration.
#[divan::bench(args = [
    Dataset::Urls,
    Dataset::Cb,
    Dataset::Log,
    Dataset::Json,
    Dataset::Path,
    Dataset::Email,
    Dataset::Rare,
])]
fn fsst_like(bencher: Bencher, dataset: &Dataset) {
    // Inputs are prepared once, outside the measured closure.
    let source = dataset.fsst_array();
    let row_count = source.len();
    let haystack = source.clone().into_array();
    let needle = ConstantArray::new(dataset.pattern(), row_count).into_array();

    bencher.bench_local(|| {
        let like_array = Like
            .try_new_array(
                row_count,
                LikeOptions::default(),
                [haystack.clone(), needle.clone()],
            )
            .unwrap()
            .into_array();

        // A fresh execution context is created per iteration, after the array is built.
        let mut ctx = SESSION.create_execution_ctx();
        like_array.execute::<Canonical>(&mut ctx).unwrap()
    });
}
80 changes: 5 additions & 75 deletions encodings/fsst/benches/fsst_url_compare.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,13 @@
use std::sync::LazyLock;

use divan::Bencher;
use rand::Rng;
use rand::SeedableRng;
use rand::rngs::StdRng;
use vortex_array::IntoArray;
use vortex_array::RecursiveCanonical;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::ConstantArray;
use vortex_array::arrays::VarBinArray;
use vortex_array::builtins::ArrayBuiltins;
use vortex_array::compute::warm_up_vtables;
use vortex_array::dtype::DType;
use vortex_array::dtype::Nullability;
use vortex_array::expr::like;
use vortex_array::expr::lit;
use vortex_array::expr::root;
Expand All @@ -26,6 +21,10 @@ use vortex_array::scalar_fn::fns::operators::Operator;
use vortex_array::session::ArraySession;
use vortex_fsst::fsst_compress;
use vortex_fsst::fsst_train_compressor;
use vortex_fsst::test_utils::HIGH_MATCH_DOMAIN;
use vortex_fsst::test_utils::LOW_MATCH_DOMAIN;
use vortex_fsst::test_utils::NUM_STRINGS;
use vortex_fsst::test_utils::generate_url_data;
use vortex_session::VortexSession;

fn main() {
Expand All @@ -36,76 +35,7 @@ fn main() {
// Session shared by the benchmarks in this file; built on first access from an empty
// `VortexSession` extended through `with::<ArraySession>()`.
static SESSION: LazyLock<VortexSession> =
LazyLock::new(|| VortexSession::empty().with::<ArraySession>());

/// Number of URLs produced by `generate_url_data`.
const NUM_URLS: usize = 100_000;

/// A high-frequency domain that appears in ~50% of generated URLs.
const HIGH_MATCH_DOMAIN: &str = "smeshariki.ru";

/// A low-frequency domain that appears in ~1% of generated URLs.
const LOW_MATCH_DOMAIN: &str = "rare-example-domain.com";

// Domains modeled after real ClickBench URL distributions.
//
// Each entry is `(domain, weight)`. The weights sum to 1000, so a weight of 10
// corresponds to roughly 1% of the generated URLs.
const DOMAINS: &[(&str, u32)] = &[
("smeshariki.ru", 500), // ~50%
("auto.ru", 150), // ~15%
("komme.ru", 100), // ~10%
("yandex.ru", 80), // ~8%
("mail.ru", 60), // ~6%
("livejournal.com", 40), // ~4%
("vk.com", 30), // ~3%
("avito.ru", 20), // ~2%
("kinopoisk.ru", 10), // ~1%
("rare-example-domain.com", 10), // ~1%
];

// URL path components; one entry is picked at random for each generated URL.
const PATHS: &[&str] = &[
"/GameMain.aspx",
"/index.php",
"/catalog/item",
"/search",
"/news/article",
"/user/profile",
"/collection/view",
"/cars/used/sale",
"/forum/thread",
"/photo/album",
"/video/watch",
"/download/file",
"/api/v1/resource",
"/shop/product",
"/blog/post",
];

/// Builds the benchmark corpus: `NUM_URLS` ClickBench-style URLs shaped like
/// `http://{domain}{path}?id={id}&tab={tab}#ref={id}`.
///
/// The domain is chosen by weight from `DOMAINS` and the path from `PATHS`. The RNG is
/// seeded with the fixed value 42.
fn generate_url_data() -> VarBinArray {
    let mut rng = StdRng::seed_from_u64(42);

    // Upper bound for the weighted domain roll.
    let total_weight = DOMAINS.iter().map(|&(_, weight)| weight).sum::<u32>();

    let mut urls: Vec<Option<Box<[u8]>>> = Vec::with_capacity(NUM_URLS);
    for _ in 0..NUM_URLS {
        // Walk the cumulative weights until the roll falls inside a domain's bucket;
        // the first domain is the fallback if no bucket matches.
        let domain_roll = rng.random_range(0..total_weight);
        let mut running = 0u32;
        let domain = DOMAINS
            .iter()
            .find(|&&(_, weight)| {
                running += weight;
                domain_roll < running
            })
            .map_or(DOMAINS[0].0, |&(name, _)| name);

        // The draws below stay in this order (path, id, tab) so the generated sequence
        // is unchanged for a given seed.
        let path_index = rng.random_range(0..PATHS.len());
        let path = PATHS[path_index];
        let query_id: u32 = rng.random_range(1..100_000);
        let tab: u16 = rng.random_range(1..20);

        let url = format!("http://{domain}{path}?id={query_id}&tab={tab}#ref={query_id}");
        urls.push(Some(url.into_bytes().into_boxed_slice()));
    }

    VarBinArray::from_iter(urls, DType::Utf8(Nullability::NonNullable))
}
// URL count for this benchmark, taken from `vortex_fsst::test_utils::NUM_STRINGS`.
const NUM_URLS: usize = NUM_STRINGS;

// URL corpus, produced once on first access by calling `generate_url_data`.
static URL_DATA: LazyLock<VarBinArray> = LazyLock::new(generate_url_data);

Expand Down
Loading
Loading