diff --git a/.github/scripts/run-sql-bench.sh b/.github/scripts/run-sql-bench.sh index 96c2e2f19dc..7efda7b3c61 100755 --- a/.github/scripts/run-sql-bench.sh +++ b/.github/scripts/run-sql-bench.sh @@ -2,8 +2,8 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright the Vortex contributors # -# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench) for the given targets. -# This script is used by the sql-benchmarks.yml workflow. +# Runs SQL benchmarks (datafusion-bench, duckdb-bench, lance-bench, clickhouse-bench) +# for the given targets. This script is used by the sql-benchmarks.yml workflow. # # Usage: # run-sql-bench.sh [options] @@ -11,13 +11,13 @@ # Arguments: # subcommand The benchmark subcommand (e.g., tpch, clickbench, tpcds) # targets Comma-separated list of engine:format pairs -# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet") +# (e.g., "datafusion:parquet,datafusion:vortex,duckdb:parquet,clickhouse:parquet") # # Options: # --scale-factor Scale factor for the benchmark (e.g., 1.0, 10.0) # --iterations Number of iterations to pass to each benchmark binary # --remote-storage Remote storage URL (e.g., s3://bucket/path/) -# If provided, runs in remote mode (no lance support). +# If provided, runs in remote mode (no lance/clickhouse support). # --benchmark-id Benchmark ID for error messages (e.g., tpch-s3) set -Eeu -o pipefail @@ -71,6 +71,13 @@ if $is_remote && echo "$targets" | grep -q 'lance'; then exit 1 fi +# ClickHouse on remote storage is not supported. clickhouse-local reads local files only. +if $is_remote && echo "$targets" | grep -q 'clickhouse:'; then + echo "ERROR: ClickHouse benchmarks are not supported for remote storage." + echo "Remove 'clickhouse:' targets for benchmark '${benchmark_id:-unknown}'." + exit 1 +fi + # Extract formats for each engine from the targets string. 
# Example input: "datafusion:parquet,datafusion:vortex,datafusion:lance,duckdb:parquet" # @@ -84,6 +91,7 @@ fi df_formats=$(echo "$targets" | tr ',' '\n' | (grep '^datafusion:' | grep -v ':lance$' || true) | sed 's/datafusion://' | tr '\n' ',' | sed 's/,$//') ddb_formats=$(echo "$targets" | tr ',' '\n' | (grep '^duckdb:' || true) | sed 's/duckdb://' | tr '\n' ',' | sed 's/,$//') has_lance=$(echo "$targets" | grep -q 'datafusion:lance' && echo "true" || echo "false") +has_clickhouse=$(echo "$targets" | grep -q 'clickhouse:' && echo "true" || echo "false") # Build options string. opts="" @@ -136,3 +144,14 @@ if ! $is_remote && [[ "$has_lance" == "true" ]] && [[ -f "target/release_debug/l cat lance-results.json >> results.json fi + +# ClickHouse-bench only runs for local benchmarks (clickhouse-local reads local files). +if ! $is_remote && [[ "$has_clickhouse" == "true" ]] && [[ -f "target/release_debug/clickhouse-bench" ]]; then + # shellcheck disable=SC2086 + target/release_debug/clickhouse-bench "$subcommand" \ + -d gh-json \ + $opts \ + -o ch-results.json + + cat ch-results.json >> results.json +fi diff --git a/.github/workflows/bench.yml b/.github/workflows/bench.yml index ebeb429849b..69c7dbe6c55 100644 --- a/.github/workflows/bench.yml +++ b/.github/workflows/bench.yml @@ -121,7 +121,7 @@ jobs: "id": "clickbench-nvme", "subcommand": "clickbench", "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb", + "targets": "datafusion:parquet,datafusion:vortex,datafusion:vortex-compact,datafusion:lance,duckdb:parquet,duckdb:vortex,duckdb:vortex-compact,duckdb:duckdb,clickhouse:parquet", "build_lance": true }, { diff --git a/.github/workflows/sql-benchmarks.yml b/.github/workflows/sql-benchmarks.yml index 08afe175341..044efb6028f 100644 --- a/.github/workflows/sql-benchmarks.yml +++ b/.github/workflows/sql-benchmarks.yml @@ -21,7 +21,7 @@ 
on: "id": "clickbench-nvme", "subcommand": "clickbench", "name": "Clickbench on NVME", - "targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb" + "targets": "datafusion:parquet,datafusion:vortex,duckdb:parquet,duckdb:vortex,duckdb:duckdb,clickhouse:parquet" }, { "id": "tpch-nvme", @@ -135,6 +135,16 @@ jobs: - uses: ./.github/actions/system-info + - name: Install ClickHouse + if: contains(matrix.targets, 'clickhouse:') + env: + CLICKHOUSE_VERSION: "25.8.18.1" + run: | + wget -qO- "https://github.com/ClickHouse/ClickHouse/releases/download/v${CLICKHOUSE_VERSION}-lts/clickhouse-common-static-${CLICKHOUSE_VERSION}-amd64.tgz" | tar xz + cp clickhouse-common-static-${CLICKHOUSE_VERSION}/usr/bin/clickhouse . + chmod +x clickhouse + echo "CLICKHOUSE_BINARY=$PWD/clickhouse" >> $GITHUB_ENV + - name: Build binaries shell: bash env: @@ -144,6 +154,9 @@ jobs: if [ "${{ matrix.build_lance }}" = "true" ]; then packages="$packages --bin lance-bench" fi + if echo "${{ matrix.targets }}" | grep -q 'clickhouse:'; then + packages="$packages --bin clickhouse-bench" + fi cargo build $packages --profile release_debug - name: Generate data diff --git a/Cargo.lock b/Cargo.lock index 8a09716070c..3c509db29ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -880,19 +880,20 @@ dependencies = [ [[package]] name = "borsh" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d1da5ab77c1437701eeff7c88d968729e7766172279eab0676857b3d63af7a6f" +checksum = "cfd1e3f8955a5d7de9fab72fc8373fade9fb8a703968cb200ae3dc6cf08e185a" dependencies = [ "borsh-derive", + "bytes", "cfg_aliases", ] [[package]] name = "borsh-derive" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0686c856aa6aac0c4498f936d7d6a02df690f614c03e4d906d1018062b5c5e2c" +checksum = "bfcfdc083699101d5a7965e49925975f2f55060f94f9a05e7187be95d530ca59" dependencies = [ "once_cell", 
"proc-macro-crate", @@ -1248,6 +1249,18 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c8d4a3bb8b1e0c1050499d1815f5ab16d04f0959b233085fb31653fbfc9d98f9" +[[package]] +name = "clickhouse-bench" +version = "0.1.0" +dependencies = [ + "anyhow", + "clap", + "parking_lot", + "tokio", + "tracing", + "vortex-bench", +] + [[package]] name = "cmake" version = "0.1.57" @@ -9223,7 +9236,7 @@ dependencies = [ "toml_datetime 0.7.5+spec-1.1.0", "toml_parser", "toml_writer", - "winnow", + "winnow 0.7.15", ] [[package]] @@ -9237,39 +9250,39 @@ dependencies = [ [[package]] name = "toml_datetime" -version = "1.0.0+spec-1.1.0" +version = "1.0.1+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "32c2555c699578a4f59f0cc68e5116c8d7cabbd45e1409b989d4be085b53f13e" +checksum = "9b320e741db58cac564e26c607d3cc1fdc4a88fd36c879568c07856ed83ff3e9" dependencies = [ "serde_core", ] [[package]] name = "toml_edit" -version = "0.25.4+spec-1.1.0" +version = "0.25.5+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7193cbd0ce53dc966037f54351dbbcf0d5a642c7f0038c382ef9e677ce8c13f2" +checksum = "8ca1a40644a28bce036923f6a431df0b34236949d111cc07cb6dca830c9ef2e1" dependencies = [ "indexmap", - "toml_datetime 1.0.0+spec-1.1.0", + "toml_datetime 1.0.1+spec-1.1.0", "toml_parser", - "winnow", + "winnow 1.0.0", ] [[package]] name = "toml_parser" -version = "1.0.9+spec-1.1.0" +version = "1.0.10+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "702d4415e08923e7e1ef96cd5727c0dfed80b4d2fa25db9647fe5eb6f7c5a4c4" +checksum = "7df25b4befd31c4816df190124375d5a20c6b6921e2cad937316de3fccd63420" dependencies = [ - "winnow", + "winnow 1.0.0", ] [[package]] name = "toml_writer" -version = "1.0.6+spec-1.1.0" +version = "1.0.7+spec-1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"ab16f14aed21ee8bfd8ec22513f7287cd4a91aa92e44edfe2c17ddd004e92607" +checksum = "f17aaa1c6e3dc22b1da4b6bba97d066e354c7945cac2f7852d4e4e7ca7a6b56d" [[package]] name = "tonic" @@ -11227,6 +11240,12 @@ name = "winnow" version = "0.7.15" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "df79d97927682d2fd8adb29682d1140b343be4ac0f08fd68b7765d9c059d3945" + +[[package]] +name = "winnow" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a90e88e4667264a994d34e6d1ab2d26d398dcdca8b7f52bec8668957517fc7d8" dependencies = [ "memchr", ] diff --git a/Cargo.toml b/Cargo.toml index 1f21bdf4f21..4e44755c566 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -50,6 +50,7 @@ members = [ "encodings/zstd", "encodings/bytebool", # Benchmarks + "benchmarks/clickhouse-bench", "benchmarks/lance-bench", "benchmarks/compress-bench", "benchmarks/datafusion-bench", diff --git a/benchmarks/clickhouse-bench/Cargo.toml b/benchmarks/clickhouse-bench/Cargo.toml new file mode 100644 index 00000000000..f85ff2ecd13 --- /dev/null +++ b/benchmarks/clickhouse-bench/Cargo.toml @@ -0,0 +1,23 @@ +[package] +name = "clickhouse-bench" +description = "ClickHouse (clickhouse-local) benchmark runner for Vortex" +authors.workspace = true +edition.workspace = true +homepage.workspace = true +license.workspace = true +readme.workspace = true +repository.workspace = true +rust-version.workspace = true +version.workspace = true +publish = false + +[dependencies] +anyhow = { workspace = true } +clap = { workspace = true, features = ["derive"] } +parking_lot = { workspace = true } +tokio = { workspace = true } +tracing = { workspace = true } +vortex-bench = { workspace = true } + +[lints] +workspace = true diff --git a/benchmarks/clickhouse-bench/build.rs b/benchmarks/clickhouse-bench/build.rs new file mode 100644 index 00000000000..7ef98c8e48d --- /dev/null +++ b/benchmarks/clickhouse-bench/build.rs @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: Apache-2.0 
+// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! Build script that exports the ClickHouse binary path. +//! +//! Resolution order: +//! 1. `CLICKHOUSE_BINARY` env var — use as-is. +//! 2. Falls back to `"clickhouse"` (i.e., resolve from `$PATH` at runtime). +//! +//! Users must install ClickHouse themselves for local runs. +//! In CI, it is installed via the workflow before the benchmark step. + +fn main() { + println!("cargo:rerun-if-env-changed=CLICKHOUSE_BINARY"); + + let binary = std::env::var("CLICKHOUSE_BINARY").unwrap_or_else(|_| "clickhouse".to_string()); + println!("cargo:rustc-env=CLICKHOUSE_BINARY={binary}"); +} diff --git a/benchmarks/clickhouse-bench/src/lib.rs b/benchmarks/clickhouse-bench/src/lib.rs new file mode 100644 index 00000000000..420d88c6a9a --- /dev/null +++ b/benchmarks/clickhouse-bench/src/lib.rs @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +//! ClickHouse Local context for benchmarks. +//! +//! Spawns a fresh `clickhouse-local` process for each query execution. Setup SQL +//! (CREATE VIEW statements) is prepended to each query so that table definitions +//! are available, but the setup cost is excluded from query timings by measuring +//! only the wall-clock time from the first byte of query output to process exit. +//! +//! ## Why per-query processes? +//! +//! `clickhouse-local` in non-interactive (piped stdin) mode reads **all** of stdin +//! before executing any queries. This makes a persistent-process + delimiter protocol +//! impossible — the process blocks on stdin, never producing output until EOF. Spawning +//! a fresh process per query avoids this by closing stdin immediately after writing the +//! full SQL batch (setup + query), which triggers execution. +//! +//! Process spawn overhead is negligible (~1ms) compared to query execution times. +//! +//! The ClickHouse binary is resolved at build time via `build.rs`: +//! 1. 
`CLICKHOUSE_BINARY` env var — use the specified path. +//! 2. Falls back to `"clickhouse"` — resolved from `$PATH` at runtime. +//! +//! For local runs, install ClickHouse manually (e.g., `brew install clickhouse` +//! or download from <https://clickhouse.com/docs/en/install>). +//! In CI, it is installed by the workflow before the benchmark step. + +use std::io::BufRead; +use std::io::BufReader; +use std::io::Read; +use std::io::Write; +use std::path::Path; +use std::path::PathBuf; +use std::process::Command; +use std::process::Stdio; +use std::time::Duration; +use std::time::Instant; + +use anyhow::Context; +use anyhow::Result; +use tracing::trace; +use vortex_bench::Benchmark; +use vortex_bench::Format; + +/// Path to the ClickHouse binary, set by build.rs at compile time. +/// +/// This is either the value of the `CLICKHOUSE_BINARY` env var at build time, +/// or `"clickhouse"` (resolved from `$PATH` at runtime). +const CLICKHOUSE_BINARY: &str = env!("CLICKHOUSE_BINARY"); + +/// A client that spawns `clickhouse-local` processes for running SQL benchmarks. +/// +/// Setup SQL (CREATE VIEW) is stored at construction time and prepended to each +/// query execution. Each `execute_query` call spawns a fresh process, writes the +/// full SQL batch (setup + query), closes stdin, and reads the output. +pub struct ClickHouseClient { + /// Path to the ClickHouse binary. + binary: PathBuf, + /// Setup SQL statements (CREATE VIEW) to prepend to each query. + setup_sql: Vec<String>, +} + +impl ClickHouseClient { + /// Create a new client that will use `clickhouse-local` for query execution. + /// + /// Validates that the binary is available and builds setup SQL (CREATE VIEW + /// statements) from the benchmark's table specs. Only Parquet format is supported.
+ pub fn new(benchmark: &dyn Benchmark, format: Format) -> Result<Self> { + if format != Format::Parquet { + anyhow::bail!("clickhouse-bench only supports Parquet format, got {format}"); + } + + let binary = PathBuf::from(CLICKHOUSE_BINARY); + Self::verify_binary(&binary)?; + tracing::info!(binary = %binary.display(), "Using clickhouse-local"); + + let setup_sql = Self::build_setup_sql(benchmark, format)?; + + Ok(Self { binary, setup_sql }) + } + + /// Check that the ClickHouse binary is available. + /// + /// For absolute paths, checks that the file exists on disk. + /// For bare names (e.g., `"clickhouse"`), tries to invoke it to verify it's resolvable. + fn verify_binary(binary: &Path) -> Result<()> { + if binary.is_absolute() { + anyhow::ensure!( + binary.exists(), + "ClickHouse binary not found at '{path}'. \ + Set CLICKHOUSE_BINARY env var to the correct path, or install ClickHouse \ + and ensure it is on $PATH.", + path = binary.display() + ); + } + + let output = Command::new(binary.as_os_str()) + .args(["local", "--version"]) + .output() + .with_context(|| { + format!( + "ClickHouse binary '{name}' not found on $PATH. \ + Install ClickHouse (https://clickhouse.com/docs/en/install) or set \ + CLICKHOUSE_BINARY env var to an absolute path before building.", + name = binary.display() + ) + })?; + + anyhow::ensure!( + output.status.success(), + "ClickHouse binary at '{name}' failed to run: {stderr}", + name = binary.display(), + stderr = String::from_utf8_lossy(&output.stderr) + ); + + let version = String::from_utf8_lossy(&output.stdout); + tracing::debug!(version = version.trim(), "Verified clickhouse binary"); + + Ok(()) + } + + /// Build `CREATE VIEW ... AS SELECT * FROM file(...)` statements for all tables.
+ fn build_setup_sql(benchmark: &dyn Benchmark, format: Format) -> Result<Vec<String>> { + let data_url = benchmark.data_url(); + let base_dir = if data_url.scheme() == "file" { + data_url + .to_file_path() + .map_err(|_| anyhow::anyhow!("Invalid file URL: {data_url}"))? + } else { + anyhow::bail!("clickhouse-bench only supports local file:// data URLs"); + }; + + let format_dir = base_dir.join(format.name()); + if !format_dir.exists() { + anyhow::bail!( + "Data directory does not exist: {}. Run data generation first.", + format_dir.display() + ); + } + + let mut stmts = Vec::new(); + for table_spec in benchmark.table_specs() { + let name = table_spec.name; + let pattern = benchmark + .pattern(name, format) + .map(|p| p.to_string()) + .unwrap_or_else(|| format!("*.{}", format.ext())); + + let data_path = format!("{}/{}", format_dir.display(), pattern); + + tracing::info!( + table = name, + path = %data_path, + "Registering ClickHouse table" + ); + + stmts.push(format!( + "CREATE VIEW IF NOT EXISTS {name} AS \ + SELECT * FROM file('{data_path}', Parquet);" + )); + } + + Ok(stmts) + } + + /// Execute a SQL query, returning `(row_count, timing)`. + /// + /// Spawns a fresh `clickhouse-local` process with the setup SQL prepended to + /// the query. Timing covers only query execution (from process spawn through + /// output collection). + pub fn execute_query(&mut self, query: &str) -> Result<(usize, Option<Duration>)> { + trace!("execute clickhouse query: {query}"); + + let mut query_str = query.to_string(); + if !query_str.trim_end().ends_with(';') { + query_str.push(';'); + } + + // Build the full SQL batch: setup statements + query.
+ let mut full_sql = String::new(); + for stmt in &self.setup_sql { + full_sql.push_str(stmt); + full_sql.push('\n'); + } + full_sql.push_str(&query_str); + full_sql.push('\n'); + + let time_instant = Instant::now(); + + let mut child = Command::new(self.binary.as_os_str()) + .args(["local", "--format", "TabSeparated"]) + .stdin(Stdio::piped()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()) + .spawn() + .context("Failed to spawn clickhouse-local")?; + + // Write all SQL and close stdin to trigger execution. + { + let mut stdin = child.stdin.take().context("Failed to open stdin")?; + stdin + .write_all(full_sql.as_bytes()) + .context("Failed to write SQL to clickhouse-local")?; + stdin.flush().context("Failed to flush stdin")?; + // stdin is dropped here, closing the pipe and signaling EOF. + } + + // Read all output lines from stdout. + let stdout = child.stdout.take().context("Failed to open stdout")?; + let reader = BufReader::new(stdout); + let mut row_count = 0usize; + for line in reader.lines() { + let line = line.context("Failed to read from clickhouse-local stdout")?; + if !line.trim().is_empty() { + row_count += 1; + } + } + + let status = child + .wait() + .context("Failed to wait for clickhouse-local")?; + + let query_time = time_instant.elapsed(); + + if !status.success() { + let stderr = match child.stderr.take() { + Some(s) => { + let mut buf = String::new(); + BufReader::new(s).read_to_string(&mut buf).ok(); + buf + } + None => String::new(), + }; + anyhow::bail!("clickhouse-local exited with {status}. 
stderr:\n{stderr}",); + } + + Ok((row_count, Some(query_time))) + } +} diff --git a/benchmarks/clickhouse-bench/src/main.rs b/benchmarks/clickhouse-bench/src/main.rs new file mode 100644 index 00000000000..d705b23eb22 --- /dev/null +++ b/benchmarks/clickhouse-bench/src/main.rs @@ -0,0 +1,125 @@ +// SPDX-License-Identifier: Apache-2.0 +// SPDX-FileCopyrightText: Copyright the Vortex contributors + +use std::path::PathBuf; + +use clap::Parser; +use clickhouse_bench::ClickHouseClient; +use tokio::runtime::Runtime; +use vortex_bench::BenchmarkArg; +use vortex_bench::Engine; +use vortex_bench::Format; +use vortex_bench::Opt; +use vortex_bench::Opts; +use vortex_bench::create_benchmark; +use vortex_bench::create_output_writer; +use vortex_bench::display::DisplayFormat; +use vortex_bench::runner::BenchmarkMode; +use vortex_bench::runner::BenchmarkQueryResult; +use vortex_bench::runner::SqlBenchmarkRunner; +use vortex_bench::runner::filter_queries; +use vortex_bench::setup_logging_and_tracing; + +/// ClickHouse (clickhouse-local) benchmark runner. +/// +/// Runs queries against Parquet data using clickhouse-local as a performance baseline. +/// This allows comparing ClickHouse's native Parquet reading performance against other engines +/// (DuckDB, DataFusion) on the same hardware and dataset. 
+#[derive(Parser)] +struct Args { + #[arg(value_enum)] + benchmark: BenchmarkArg, + + #[arg(short, long, default_value_t = 5)] + iterations: usize, + + #[arg(short, long)] + verbose: bool, + + #[arg(long)] + tracing: bool, + + #[arg(short, long, default_value_t, value_enum)] + display_format: DisplayFormat, + + #[arg(short, long, value_delimiter = ',')] + queries: Option<Vec<String>>, + + #[arg(short, long, value_delimiter = ',')] + exclude_queries: Option<Vec<String>>, + + #[arg(short)] + output_path: Option<PathBuf>, + + #[arg(long, default_value_t = false)] + track_memory: bool, + + #[arg(long, default_value_t = false)] + hide_progress_bar: bool, + + #[arg(long = "opt", value_delimiter = ',', value_parser = clap::value_parser!(Opt))] + options: Vec<Opt>, +} + +struct ClickHouseQueryResult { + row_count: usize, +} + +impl BenchmarkQueryResult for ClickHouseQueryResult { + fn row_count(&self) -> usize { + self.row_count + } + + fn display(self) -> String { + format!("{} rows", self.row_count) + } +} + +fn main() -> anyhow::Result<()> { + let args = Args::parse(); + let opts = Opts::from(args.options); + + setup_logging_and_tracing(args.verbose, args.tracing)?; + + let benchmark = create_benchmark(args.benchmark, &opts)?; + + let filtered_queries = filter_queries( + benchmark.queries()?, + args.queries.as_ref(), + args.exclude_queries.as_ref(), + ); + + // Generate base Parquet data if needed.
+ if benchmark.data_url().scheme() == "file" { + let runtime = Runtime::new()?; + runtime.block_on(async { benchmark.generate_base_data().await })?; + } + + let formats = vec![Format::Parquet]; + + let mut runner = SqlBenchmarkRunner::new( + benchmark.as_ref(), + Engine::ClickHouse, + formats, + args.track_memory, + args.hide_progress_bar, + )?; + + runner.run_all( + &filtered_queries, + BenchmarkMode::Run { + iterations: args.iterations, + }, + |format| ClickHouseClient::new(benchmark.as_ref(), format), + |ctx, _query_idx, _format, query| { + let (row_count, duration) = ctx.execute_query(query)?; + Ok((duration, ClickHouseQueryResult { row_count })) + }, + )?; + + let benchmark_id = format!("clickhouse-{}", benchmark.dataset_name()); + let writer = create_output_writer(&args.display_format, args.output_path, &benchmark_id)?; + runner.export_to(&args.display_format, writer)?; + + Ok(()) +} diff --git a/vortex-bench/src/clickbench/benchmark.rs b/vortex-bench/src/clickbench/benchmark.rs index 5e14cbcf40e..a0dcb4ea44f 100644 --- a/vortex-bench/src/clickbench/benchmark.rs +++ b/vortex-bench/src/clickbench/benchmark.rs @@ -1,14 +1,12 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use std::env; use std::fs; -use std::path::Path; +use std::path::PathBuf; use anyhow::Result; use reqwest::Client; use url::Url; -use vortex::error::VortexExpect; use crate::Benchmark; use crate::BenchmarkDataset; @@ -37,14 +35,21 @@ impl ClickBenchBenchmark { }) } + /// Returns the path to the queries file. 
+ fn queries_file_path(&self) -> PathBuf { + if let Some(file) = &self.queries_file { + return file.into(); + } + let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR")); + manifest_dir.join("clickbench_queries.sql") + } + fn create_data_url(remote_data_dir: &Option<String>, flavor: Flavor) -> Result<Url> { match remote_data_dir { None => { let basepath = format!("clickbench_{flavor}").to_data_path(); - Ok(Url::parse(&format!( - "file:{}/", - basepath.to_str().vortex_expect("path should be utf8") - ))?) + Url::from_directory_path(basepath) + .map_err(|_| anyhow::anyhow!("Failed to convert ClickBench data path to URL")) } Some(remote_data_dir) => { if !remote_data_dir.ends_with("/") { @@ -69,10 +74,7 @@ impl ClickBenchBenchmark { #[async_trait::async_trait] impl Benchmark for ClickBenchBenchmark { fn queries(&self) -> Result<Vec<String>> { - let queries_filepath = match &self.queries_file { - Some(file) => file.into(), - None => Path::new(env!("CARGO_MANIFEST_DIR")).join("clickbench_queries.sql"), - }; + let queries_filepath = self.queries_file_path(); Ok(fs::read_to_string(queries_filepath)?
.split(';') diff --git a/vortex-bench/src/lib.rs b/vortex-bench/src/lib.rs index 6dad0f0f6a1..8be4c6bcea8 100644 --- a/vortex-bench/src/lib.rs +++ b/vortex-bench/src/lib.rs @@ -206,6 +206,9 @@ pub enum Engine { #[clap(name = "duckdb")] #[serde(rename = "duckdb")] DuckDB, + #[clap(name = "clickhouse")] + #[serde(rename = "clickhouse")] + ClickHouse, } impl Display for Engine { @@ -213,6 +216,7 @@ match self { Engine::DataFusion => write!(f, "datafusion"), Engine::DuckDB => write!(f, "duckdb"), + Engine::ClickHouse => write!(f, "clickhouse"), Engine::Vortex => write!(f, "vortex"), Engine::Arrow => write!(f, "arrow"), } diff --git a/vortex-duckdb/cpp/replacement_scan.cpp b/vortex-duckdb/cpp/replacement_scan.cpp index 62553f1ce4c..14b9bb7812a 100644 --- a/vortex-duckdb/cpp/replacement_scan.cpp +++ b/vortex-duckdb/cpp/replacement_scan.cpp @@ -34,7 +34,7 @@ VortexScanReplacement(duckdb::ClientContext &context, table_function->alias = fs.ExtractBaseName(table_name); } - return table_function; + return duckdb::unique_ptr<duckdb::TableRef>(table_function.release()); } } // namespace vortex