diff --git a/Cargo.lock b/Cargo.lock index 38a21cae5d4..0c1d847db67 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -10015,7 +10015,6 @@ dependencies = [ "kanal", "num-traits", "object_store", - "once_cell", "parking_lot", "paste", "reqwest", @@ -10028,7 +10027,6 @@ dependencies = [ "vortex-runend", "vortex-sequence", "vortex-utils", - "walkdir", "zip", ] diff --git a/Cargo.toml b/Cargo.toml index a025edb2769..93dff45e3cc 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -237,7 +237,6 @@ tracing-perfetto = "0.1.5" tracing-subscriber = "0.3" url = "2.5.7" uuid = { version = "1.19", features = ["js"] } -walkdir = "2.5.0" wasm-bindgen-futures = "0.4.54" xshell = "0.2.6" zigzag = "0.1.0" diff --git a/vortex-duckdb/Cargo.toml b/vortex-duckdb/Cargo.toml index e5955981400..2d095c1c3e5 100644 --- a/vortex-duckdb/Cargo.toml +++ b/vortex-duckdb/Cargo.toml @@ -24,7 +24,6 @@ path = "src/lib.rs" crate-type = ["staticlib", "cdylib", "rlib"] [dependencies] -anyhow = { workspace = true } async-fs = { workspace = true } async-trait = { workspace = true } bitvec = { workspace = true } @@ -36,15 +35,16 @@ num-traits = { workspace = true } object_store = { workspace = true, features = ["aws"] } parking_lot = { workspace = true } paste = { workspace = true } -tempfile = { workspace = true } tracing = { workspace = true } url = { workspace = true } vortex = { workspace = true, features = ["files", "tokio", "object_store"] } vortex-utils = { workspace = true, features = ["dashmap"] } [dev-dependencies] +anyhow = { workspace = true } jiff = { workspace = true } rstest = { workspace = true } +tempfile = { workspace = true } vortex-array = { workspace = true, features = ["_test-harness"] } vortex-runend = { workspace = true } vortex-sequence = { workspace = true } @@ -56,7 +56,5 @@ workspace = true bindgen = { workspace = true } cbindgen = { workspace = true } cc = { workspace = true } -once_cell = { workspace = true } reqwest = { workspace = true } -walkdir = { workspace = true } zip = { workspace = 
true } diff --git a/vortex-duckdb/build.rs b/vortex-duckdb/build.rs index 1908b31ba77..be20dde4b8a 100644 --- a/vortex-duckdb/build.rs +++ b/vortex-duckdb/build.rs @@ -2,65 +2,64 @@ // SPDX-FileCopyrightText: Copyright the Vortex contributors #![allow(clippy::unwrap_used)] -#![allow(clippy::expect_used)] -#![allow(clippy::panic)] +// exit(1) + cargo:error= doesn't provide a double-traceback like panic!() +#![allow(clippy::exit)] -use std::borrow::ToOwned; use std::env; use std::fs; -use std::io::ErrorKind; use std::path::Path; use std::path::PathBuf; use std::process::Command; +use std::process::exit; use bindgen::Abi; -use once_cell::sync::Lazy; - -static DUCKDB_VERSION: Lazy = Lazy::new(|| { - // Override the DuckDB version via environment variable in case of an extension build. - // `DUCKDB_VERSION` is set by the extension build in the `duckdb-vortex` repo. - // - // This is to ensure that we don't implicitly build against a different DuckDB version during - // an extension build which might lead to subtle ABI breaks, e.g. reordering fields in C++ structs. - if let Ok(version) = env::var("DUCKDB_VERSION") { - // DUCKDB_VERSION env var can be set by: - // - The extension build in `vortex-data/duckdb-vortex` repo - // - Developers who want to test against a specific version/commit - parse_version(&version) - } else { - // The default DuckDB version to use when DUCKDB_VERSION env var is not set. 
- DuckDBVersion::Release("1.5.0".to_owned()) - } -}); const DUCKDB_RELEASES_URL: &str = "https://github.com/duckdb/duckdb/releases/download"; +const DUCKDB_SOURCE_RELEASE_URL: &str = "https://github.com/duckdb/duckdb/archive/refs/tags"; +const DUCKDB_SOURCE_COMMIT_URL: &str = "https://github.com/duckdb/duckdb/archive"; + +const BUILD_ARTIFACTS: [&str; 3] = ["libduckdb.dylib", "libduckdb.so", "libduckdb_static.a"]; + +const SOURCE_FILES: [&str; 18] = [ + "cpp/client_context.cpp", + "cpp/config.cpp", + "cpp/copy_function.cpp", + "cpp/data.cpp", + "cpp/data_chunk.cpp", + "cpp/error.cpp", + "cpp/expr.cpp", + "cpp/file_system.cpp", + "cpp/logical_type.cpp", + "cpp/object_cache.cpp", + "cpp/replacement_scan.cpp", + "cpp/reusable_dict.cpp", + "cpp/scalar_function.cpp", + "cpp/table_filter.cpp", + "cpp/table_function.cpp", + "cpp/value.cpp", + "cpp/vector.cpp", + "cpp/vector_buffer.cpp", +]; + +const DOWNLOAD_MAX_RETRIES: i32 = 3; +const DOWNLOAD_TIMEOUT: u64 = 90; -/// Represents either a released DuckDB version or a specific commit hash. #[derive(Debug, Clone)] -#[allow(dead_code)] enum DuckDBVersion { - /// A released DuckDB version of the form "X.Y.Z". - /// This mode will download pre-compiled dynamic libraries from GitHub releases. - Release(String), - /// A commit hash from `github.com/duckdb/duckdb`. - /// This mode will download source and build DuckDB from scratch. - Commit(String), + Release(String), // i.e. X.Y.Z. Download pre-compiled libraries from GitHub releases. + Commit(String), // Download source and build DuckDB from scratch. } -impl DuckDBVersion { - /// Returns the directory name suffix for this version. 
- fn dir_suffix(&self) -> String { +impl std::fmt::Display for DuckDBVersion { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { - DuckDBVersion::Release(v) => format!("v{v}"), - DuckDBVersion::Commit(c) => c.clone(), + DuckDBVersion::Release(v) => write!(f, "v{v}"), + DuckDBVersion::Commit(c) => write!(f, "{c}"), } } +} - /// Returns true if this is a release version (not a commit). - fn is_release(&self) -> bool { - matches!(self, DuckDBVersion::Release(_)) - } - +impl DuckDBVersion { /// Returns the name of the extracted source directory inside the zip archive. /// GitHub archives extract to `duckdb-{version}` for tags and `duckdb-{commit}` for commits. fn archive_inner_dir_name(&self) -> String { @@ -71,382 +70,219 @@ impl DuckDBVersion { } } -/// Parse a version string into a DuckDBVersion. -/// - Strings starting with "v" followed by semver are treated as releases -/// - Pure semver strings (X.Y.Z) are treated as releases -/// - Everything else (e.g., commit hashes) are treated as commits -fn parse_version(s: &str) -> DuckDBVersion { - let s = s.trim(); - - // Strip leading 'v' if present - let version_str = s.strip_prefix('v').unwrap_or(s); - - // Check if it looks like a semver release (X.Y.Z pattern) - let parts: Vec<&str> = version_str.split('.').collect(); - if parts.len() >= 2 && parts.iter().all(|p| p.chars().all(|c| c.is_ascii_digit())) { - DuckDBVersion::Release(version_str.to_owned()) - } else { - panic!("Invalid DuckDB version: {s}"); +impl From<&String> for DuckDBVersion { + fn from(s: &String) -> Self { + let s = s.trim(); + let version_str = s.strip_prefix('v').unwrap_or(s); + let parts: Vec<&str> = version_str.split('.').collect(); + if parts.len() >= 2 && parts.iter().all(|p| p.chars().all(|c| c.is_ascii_digit())) { + DuckDBVersion::Release(version_str.to_owned()) + } else { + DuckDBVersion::Commit(version_str.to_owned()) + } } } -/// Create an HTTP client with appropriate timeout settings. 
-fn http_client() -> Result> { +fn download_url(url: &str, path: &Path) { + if path.exists() { + return; + } + println!("cargo:info=Downloading DuckDB from {url}"); + let timeout_secs = env::var("CARGO_HTTP_TIMEOUT") .or_else(|_| env::var("HTTP_TIMEOUT")) .ok() .and_then(|s| s.parse().ok()) - .unwrap_or(90); - + .unwrap_or(DOWNLOAD_TIMEOUT); let client = reqwest::blocking::Client::builder() .timeout(std::time::Duration::from_secs(timeout_secs)) - .build()?; - Ok(client) -} - -/// Download a URL with retry logic for transient failures (5xx, timeouts, connection errors). -fn download_with_retries(url: &str) -> Result, Box> { - let client = http_client()?; - let max_retries = 3; + .build() + .unwrap(); - for attempt in 1..=max_retries { + for attempt in 1..=DOWNLOAD_MAX_RETRIES { match client.get(url).send() { Ok(response) if response.status().is_success() => { - return Ok(response.bytes()?.to_vec()); + let bytes = response.bytes().unwrap().to_vec(); + fs::write(path, bytes).unwrap(); + println!("cargo:info=Downloaded to {url}"); + return; } Ok(response) if response.status().is_server_error() => { + let status = response.status(); println!( - "cargo:warning=Download attempt {attempt}/{max_retries} failed: HTTP {} for {url}", - response.status() + "cargo:warning=Download attempt \ + {attempt}/{DOWNLOAD_MAX_RETRIES} failed: HTTP {status} for {url}" ); } - Ok(response) => { - // Client errors (4xx) are not retryable - return Err(format!("Failed to download {url}: HTTP {}", response.status()).into()); - } Err(e) => { println!( - "cargo:warning=Download attempt {attempt}/{max_retries} failed: {e} for {url}" + "cargo:warning=Download attempt \ + {attempt}/{DOWNLOAD_MAX_RETRIES} failed: {e}" ); } + // Client errors (4xx) are not retryable + Ok(response) => { + let status = response.status(); + println!("cargo:error=Failed to download {url}: HTTP {status}"); + exit(1); + } } - if attempt < max_retries { + if attempt < DOWNLOAD_MAX_RETRIES { let delay = 
std::time::Duration::from_secs(2u64.pow(attempt as u32)); println!("cargo:warning=Retrying in {}s...", delay.as_secs()); std::thread::sleep(delay); } } - Err(format!("Failed to download {url} after {max_retries} attempts").into()) -} - -/// Get the target-specific platform and architecture for downloading prebuilt libraries. -fn platform_arch() -> Result<(&'static str, &'static str), Box> { - let target = env::var("TARGET")?; - match target.as_str() { - "aarch64-apple-darwin" | "x86_64-apple-darwin" => Ok(("osx", "universal")), - "x86_64-unknown-linux-gnu" => Ok(("linux", "amd64")), - "aarch64-unknown-linux-gnu" => Ok(("linux", "arm64")), - _ => Err(format!("Unsupported target: {target}").into()), - } -} - -/// Get the base target directory for DuckDB artifacts. -fn target_dir() -> Result> { - let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR")?); - Ok(manifest_dir.parent().unwrap().join("target")) -} - -/// Download prebuilt DuckDB libraries from GitHub releases. -/// Only valid for release versions. 
-fn download_duckdb_lib_archive() -> Result> { - let version = match &*DUCKDB_VERSION { - DuckDBVersion::Release(v) => v, - DuckDBVersion::Commit(_) => { - return Err("Cannot download prebuilt libraries for commit hashes".into()); - } - }; - - let target_dir = target_dir()?; - let duckdb_lib_dir = target_dir.join(format!("duckdb-lib-v{version}")); - - let (platform, arch) = platform_arch()?; - let archive_name = format!("libduckdb-{platform}-{arch}.zip"); - let url = format!("{DUCKDB_RELEASES_URL}/v{version}/{archive_name}"); - let archive_path = duckdb_lib_dir.join(&archive_name); - - // Create directory if it doesn't exist - fs::create_dir_all(&duckdb_lib_dir)?; - - // Download if archive doesn't exist - if !archive_path.exists() { - println!("cargo:info=Downloading DuckDB libraries from {url}"); - let bytes = download_with_retries(&url)?; - fs::write(&archive_path, &bytes)?; - println!("cargo:info=Downloaded to {}", archive_path.display()); - } - - Ok(archive_path) + println!("cargo:error=Failed to download {url} after {DOWNLOAD_MAX_RETRIES} attempts"); + exit(1); } -/// Extract DuckDB libraries from the downloaded archive. -fn extract_duckdb_libraries(archive_path: &Path) -> Result> { - let duckdb_lib_dir = archive_path - .parent() - .ok_or("Invalid archive path")? 
- .to_path_buf(); - - // Check if already extracted (check for both .dylib and .so) - let dylib_exists = duckdb_lib_dir.join("libduckdb.dylib").exists(); - let so_exists = duckdb_lib_dir.join("libduckdb.so").exists(); - - if dylib_exists || so_exists { - println!("cargo:info=DuckDB libraries already extracted, skipping"); - return Ok(duckdb_lib_dir); - } - +fn extract(archive: &Path, dest: &Path) { println!( - "cargo:info=Extracting DuckDB libraries to {}", - duckdb_lib_dir.display() + "cargo:info=Extracting {} to {}", + archive.display(), + dest.display() ); - let file = fs::File::open(archive_path)?; - let mut archive = zip::ZipArchive::new(file)?; - archive.extract(&duckdb_lib_dir)?; - - Ok(duckdb_lib_dir) + let file = fs::File::open(archive).unwrap(); + zip::ZipArchive::new(file).unwrap().extract(dest).unwrap(); } -/// Download DuckDB source code archive from GitHub. -/// Works for both release versions and commit hashes. -fn download_duckdb_source_archive() -> Result> { - let target_dir = target_dir()?; - let source_dir = target_dir.join(format!("duckdb-source-{}", DUCKDB_VERSION.dir_suffix())); - - let url = match &*DUCKDB_VERSION { - DuckDBVersion::Release(v) => { - format!("https://github.com/duckdb/duckdb/archive/refs/tags/v{v}.zip") - } - DuckDBVersion::Commit(c) => { - format!("https://github.com/duckdb/duckdb/archive/{c}.zip") +fn download(version: &DuckDBVersion, library_dir: &Path) { + let target = env::var("TARGET").unwrap(); + let (platform, arch) = match target.as_str() { + "aarch64-apple-darwin" | "x86_64-apple-darwin" => ("osx", "universal"), + "x86_64-unknown-linux-gnu" => ("linux", "amd64"), + "aarch64-unknown-linux-gnu" => ("linux", "arm64"), + _ => { + println!("cargo:error=Unsupported target {target}"); + exit(1); } }; - let archive_path = source_dir.with_extension("zip"); + let archive_name = format!("libduckdb-{platform}-{arch}.zip"); + let url = format!("{DUCKDB_RELEASES_URL}/{version}/{archive_name}"); + let archive_path = 
library_dir.join(&archive_name); - // Create directory if it doesn't exist - fs::create_dir_all(&source_dir)?; + fs::create_dir_all(library_dir).unwrap(); + download_url(&url, &archive_path); - // Download if archive doesn't exist - if !archive_path.exists() { - println!("cargo:info=Downloading DuckDB source code from {url}"); - let bytes = download_with_retries(&url)?; - fs::write(&archive_path, &bytes)?; - println!("cargo:info=Downloaded to {}", archive_path.display()); + let duckdb_lib_dir = archive_path.parent().unwrap().to_path_buf(); + for artifact in BUILD_ARTIFACTS { + if duckdb_lib_dir.join(artifact).exists() { + return; + } } - - Ok(source_dir) + extract(&archive_path, &duckdb_lib_dir); } -/// Extract DuckDB source code from the downloaded archive. -fn extract_duckdb_source(source_dir: &Path) -> Result> { - let archive_path = source_dir.with_extension("zip"); - - // Check if already extracted by looking for CMakeLists.txt - let inner_dir_name = DUCKDB_VERSION.archive_inner_dir_name(); - let cmake_file = source_dir.join(&inner_dir_name).join("CMakeLists.txt"); - - if cmake_file.exists() { - println!("cargo:info=DuckDB source already extracted, skipping"); - return Ok(source_dir.to_path_buf()); +fn build_duckdb(version: &DuckDBVersion, duckdb_repo_dir: &Path) { + if let Err(e) = Command::new("make").arg("--version").output() { + println!("cargo:error=make is required to build DuckDB: {e}"); + exit(1); } - - println!( - "cargo:info=Extracting DuckDB source to {}", - source_dir.display() - ); - let file = fs::File::open(&archive_path)?; - let mut archive = zip::ZipArchive::new(file)?; - - // Extract all files, the archive contains a root directory like `duckdb-{version}/` - archive.extract(source_dir)?; - - Ok(source_dir.to_path_buf()) -} - -/// Build DuckDB from source. Used for commit hashes or when VX_DUCKDB_DEBUG is set. 
-fn build_duckdb( - duckdb_source_dir: &Path, - version: &DuckDBVersion, - debug: bool, -) -> Result> { - let build_type = match debug { - true => "debug", - false => "release", - }; - // Check for ninja - if Command::new("ninja").arg("--version").output().is_err() { - return Err( - "'ninja' is required to build DuckDB. Install it via your package manager.".into(), - ); + if let Err(e) = Command::new("ninja").arg("--version").output() { + println!("cargo:error=ninja is required to build DuckDB: {e}"); + exit(1); } - let inner_dir_name = DUCKDB_VERSION.archive_inner_dir_name(); - let duckdb_repo_dir = duckdb_source_dir.join(&inner_dir_name); - let build_dir = duckdb_repo_dir.join("build").join(build_type); - - let lib_dir = build_dir.join("src"); - let lib_dir_str = lib_dir.display(); - println!("cargo:info=Checking if DuckDB is already built in {lib_dir_str}",); - - let already_built = lib_dir.join("libduckdb.dylib").exists() - || lib_dir.join("libduckdb.so").exists() - || lib_dir - .read_dir() - .map(|mut d| { - d.any(|e| e.is_ok_and(|e| e.file_name().to_string_lossy().starts_with("libduckdb"))) - }) - .unwrap_or(false); - - if !already_built { - println!("cargo:info=Building DuckDB from source (this may take a while)..."); - - // Build with ASAN/TSAN if VX_DUCKDB_SAN=1 - let (asan_option, tsan_option) = - if env::var("VX_DUCKDB_SAN").is_ok_and(|v| matches!(v.as_str(), "1" | "true")) { - ("0", "1") // DISABLE_SANITIZER=0 enables ASAN, THREADSAN=1 enables TSAN - } else { - ("1", "0") - }; - - let mut envs = vec![ - ("GEN", "ninja"), - ("DISABLE_SANITIZER", asan_option), - ("THREADSAN", tsan_option), - ("BUILD_SHELL", "false"), - ("BUILD_UNITTESTS", "false"), - ("ENABLE_UNITTEST_CPP_TESTS", "false"), - ]; - - // If we're building from a commit (likely a pre-release), we need to - // build extensions statically. Otherwise DuckDB tries to load them - // from an http endpoint with version 0.0.1 (all non-tagged builds) - // which doesn't exists. 
httpfs also requires CURL dev headers - if matches!(version, DuckDBVersion::Commit(_)) { - envs.push(("BUILD_EXTENSIONS", "httpfs;parquet;tpch;tpcds;jemalloc")); + println!("cargo:info=Building DuckDB from source (this may take a while)..."); + let (asan_option, tsan_option) = + if env::var("VX_DUCKDB_SAN").is_ok_and(|v| matches!(v.as_str(), "1" | "true")) { + ("0", "1") // DISABLE_SANITIZER=0 enables ASAN, THREADSAN=1 enables TSAN + } else { + ("1", "0") }; - let output = Command::new("make") - .current_dir(&duckdb_repo_dir) - .envs(envs) - .output()?; - - if !output.status.success() { - return Err(format!( - "Failed to build DuckDB:\nstdout: {}\nstderr: {}", - String::from_utf8_lossy(&output.stdout), - String::from_utf8_lossy(&output.stderr) - ) - .into()); - } + // If we're building from a commit we need to build httpfs and benchmark + // extensions statically, otherwise DuckDB tries to load them from an http + // endpoint with version 0.0.1 (all non-tagged builds) which doesn't exist. 
+ // httpfs static build also requires CURL dev headers + let static_extensions = match version { + DuckDBVersion::Release(_) => "parquet;jemalloc", + DuckDBVersion::Commit(_) => "parquet;jemalloc;httpfs;tpch;tpcds", + }; - println!("cargo:info=DuckDB build completed successfully"); - } else { - println!("cargo:info=DuckDB already built, skipping build"); + let envs = [ + ("GEN", "ninja"), + ("DISABLE_SANITIZER", asan_option), + ("THREADSAN", tsan_option), + ("BUILD_SHELL", "false"), + ("BUILD_UNITTESTS", "false"), + ("ENABLE_UNITTEST_CPP_TESTS", "false"), + ("BUILD_EXTENSIONS", static_extensions), + ]; + + let output = Command::new("make") + .current_dir(duckdb_repo_dir) + .envs(envs) + .output() + .unwrap(); + if !output.status.success() { + let stdout = String::from_utf8_lossy(&output.stdout); + let stderr = String::from_utf8_lossy(&output.stderr); + println!("cargo:error=Failed to build DuckDB:\nstdout: {stdout}\nstderr: {stderr}"); + exit(1); } - // Copy libraries to a stable location - let target_dir = target_dir()?; - let duckdb_library_dir = target_dir.join(format!("duckdb-lib-{}", DUCKDB_VERSION.dir_suffix())); + println!("cargo:info=DuckDB build completed"); +} - // Only copy if the destination doesn't have the libraries - if !(duckdb_library_dir.join("libduckdb.dylib").exists() - || duckdb_library_dir.join("libduckdb.so").exists()) - { - // Clean and recreate destination - match fs::remove_dir_all(&duckdb_library_dir) { - Err(err) if err.kind() == ErrorKind::NotFound => (), - otherwise => otherwise?, - } - fs::create_dir_all(&duckdb_library_dir)?; - - // Copy .dylib and .so files - for entry in fs::read_dir(lib_dir)? 
{ - let entry = entry?; - let path = entry.path(); - - if path - .file_name() - .and_then(|name| name.to_str()) - .is_some_and(|name| name.starts_with("libduckdb")) - { - let dest = duckdb_library_dir.join(entry.file_name()); - fs::copy(&path, &dest)?; - } +fn try_build_duckdb( + source_dir: &Path, + library_dir: &Path, + version: &DuckDBVersion, + build_type: &str, +) { + let inner_dir_name = version.archive_inner_dir_name(); + let repo_dir = source_dir.join(&inner_dir_name); + let build_dir = repo_dir.join("build").join(build_type); + let build_src_dir = build_dir.join("src"); + + let mut build = true; + for artifact in BUILD_ARTIFACTS { + let path = build_src_dir.join(artifact); + if path.exists() { + println!("cargo:info=Found {artifact} in {}", path.display()); + build = false; + break; } } - Ok(duckdb_library_dir) -} - -/// Get the path to the DuckDB include directory within the source tree. -fn duckdb_include_path(source_dir: &Path) -> PathBuf { - let inner_dir_name = DUCKDB_VERSION.archive_inner_dir_name(); - source_dir.join(inner_dir_name).join("src").join("include") -} - -fn main() { - // Emit rerun-if-env-changed for all relevant environment variables - println!("cargo:rerun-if-env-changed=DUCKDB_VERSION"); - println!("cargo:rerun-if-env-changed=VX_DUCKDB_DEBUG"); - println!("cargo:rerun-if-env-changed=VX_DUCKDB_SAN"); - println!("cargo:rerun-if-env-changed=CARGO_HTTP_TIMEOUT"); - println!("cargo:rerun-if-env-changed=HTTP_TIMEOUT"); - println!("cargo:rerun-if-env-changed=TARGET"); - - let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); - let duckdb_symlink = crate_dir.join("duckdb"); - - match &*DUCKDB_VERSION { - DuckDBVersion::Release(v) => println!("cargo:info=Using DuckDB release version: {v}"), - DuckDBVersion::Commit(c) => println!("cargo:info=Using DuckDB commit: {c}"), + if build { + build_duckdb(version, &repo_dir); } - // Always download and extract source (needed for headers) - let source_dir = 
download_duckdb_source_archive().unwrap(); - let extracted_source_path = extract_duckdb_source(&source_dir).unwrap(); - - // Create/update symlink to source directory - // Remove existing symlink/directory first (ignore errors if they don't exist) - drop(fs::remove_file(&duckdb_symlink)); - drop(fs::remove_dir_all(&duckdb_symlink)); - std::os::unix::fs::symlink(&extracted_source_path, &duckdb_symlink).unwrap(); + let library_dir_str = library_dir.display(); + if let Err(err) = fs::remove_dir_all(library_dir) + && err.kind() != std::io::ErrorKind::NotFound + { + println!("cargo:error=Failed to remove {library_dir_str}: {err}"); + exit(1); + }; + fs::create_dir_all(library_dir).unwrap(); - let use_debug_build = - env::var("VX_DUCKDB_DEBUG").is_ok_and(|v| matches!(v.as_str(), "1" | "true")); - println!("cargo:info=DuckDB debug build: {use_debug_build}"); - - let library_path = if use_debug_build || !DUCKDB_VERSION.is_release() { - // Build from source for: - // - Commit hashes (no prebuilt available) - // - When VX_DUCKDB_DEBUG=1 (user wants debug build) - match build_duckdb(&extracted_source_path, &DUCKDB_VERSION, use_debug_build) { - Ok(path) => path, - Err(err) => { - println!("cargo:error={err}"); - panic!("duckdb build failed"); - } + let mut found_artifact = false; + for artifact in BUILD_ARTIFACTS { + let src = build_src_dir.join(artifact); + if !src.exists() { + continue; } - } else { - // Download prebuilt libraries for release versions - let archive_path = download_duckdb_lib_archive().unwrap(); - extract_duckdb_libraries(&archive_path).unwrap() - }; + let dest = library_dir.join(artifact); + fs::copy(&src, &dest).unwrap(); + found_artifact = true; + } - let duckdb_include_path = duckdb_include_path(&extracted_source_path); + if !found_artifact { + let artifacts = BUILD_ARTIFACTS.join(","); + println!("cargo:error=Failed to find any of {artifacts} after build"); + exit(1); + } +} - // Generate the _imported_ bindings from our C++ code. 
- bindgen::Builder::default() +fn c2rust(crate_dir: &Path, duckdb_include_dir: &Path) { + let bindings = bindgen::Builder::default() .header("cpp/include/duckdb_vx.h") .override_abi(Abi::CUnwind, ".*") // Allow for auto-generated cpp.rs code. @@ -465,100 +301,150 @@ fn main() { .rustified_enum("DUCKDB_VX_VECTOR_TYPE") .rustified_non_exhaustive_enum("DUCKDB_TYPE") .size_t_is_usize(true) - .clang_arg(format!("-I{}", duckdb_include_path.display())) + .clang_arg(format!("-I{}", duckdb_include_dir.display())) .clang_arg(format!("-I{}", crate_dir.join("cpp/include").display())) .generate_comments(true) // Tell cargo to invalidate the built crate whenever any of the // included header files changed. .parse_callbacks(Box::new(bindgen::CargoCallbacks::new())) - // Finish the builder and generate the bindings. - .generate() - .expect("Unable to generate bindings") - .write_to_file(crate_dir.join("src/cpp.rs")) - .expect("Couldn't write bindings!"); - - // Link against DuckDB dylib. - println!("cargo:rustc-link-search=native={}", library_path.display()); - println!("cargo:rustc-link-lib=dylib=duckdb"); - - // Set rpath for binaries built directly from this crate (this is not inherited by downstream crates). - println!("cargo:rustc-link-arg=-Wl,-rpath,{}", library_path.display()); + .generate(); - // Export the library path for downstream crates via the `links` manifest key. - // Downstream crates can access this via `env::var("DEP_DUCKDB_LIB_DIR")` in their build.rs - // and add their own rpath: - // - // if let Ok(duckdb_lib) = env::var("DEP_DUCKDB_LIB_DIR") { - // println!("cargo:rustc-link-arg=-Wl,-rpath,{duckdb_lib}"); - // } - // - // Alternatively, set LD_LIBRARY_PATH (Linux) or DYLD_LIBRARY_PATH (macOS) at runtime: - // LD_LIBRARY_PATH=/path/to/target/duckdb-lib-vX.Y.Z cargo run --bin ... 
- // - println!("cargo:lib_dir={}", library_path.display()); + let bindings = match bindings { + Ok(b) => b, + Err(e) => { + println!("cargo:error=Failed to generate Rust bindings: {e}"); + exit(1); + } + }; + if let Err(e) = bindings.write_to_file(crate_dir.join("src/cpp.rs")) { + println!("cargo:error=Failed to write Rust bindings: {e}"); + exit(1); + } +} - // Compile our C++ code that exposes additional DuckDB functionality. +fn cpp(duckdb_include_dir: &Path) { cc::Build::new() .std("c++20") - // Enable compiler warnings. - .flag("-Wall") - .flag("-Wextra") - .flag("-Wpedantic") + // Duckdb sources fail -Wno-unused-parameter + .flags(["-Wall", "-Wextra", "-Wpedantic", "-Wno-unused-parameter"]) .cpp(true) - .include(&duckdb_include_path) + .include(duckdb_include_dir) .include("cpp/include") - .file("cpp/client_context.cpp") - .file("cpp/config.cpp") - .file("cpp/copy_function.cpp") - .file("cpp/data.cpp") - .file("cpp/data_chunk.cpp") - .file("cpp/error.cpp") - .file("cpp/expr.cpp") - .file("cpp/file_system.cpp") - .file("cpp/logical_type.cpp") - .file("cpp/object_cache.cpp") - .file("cpp/reusable_dict.cpp") - .file("cpp/replacement_scan.cpp") - .file("cpp/scalar_function.cpp") - .file("cpp/table_filter.cpp") - .file("cpp/table_function.cpp") - .file("cpp/value.cpp") - .file("cpp/vector.cpp") - .file("cpp/vector_buffer.cpp") + .files(SOURCE_FILES) .compile("vortex-duckdb-extras"); + // bindgen generates rerun-if-changed for .h/.hpp files + for e in SOURCE_FILES { + println!("cargo:rerun-if-changed={e}"); + } +} - // Generate the _exported_ bindings from our Rust code. 
- let generated_header = crate_dir.join("include/vortex.h"); - cbindgen::Builder::new() +fn rust2c(crate_dir: &Path) { + let header = crate_dir.join("include/vortex.h"); + let output = cbindgen::Builder::new() .with_config(cbindgen::Config::from_file(crate_dir.join("cbindgen.toml")).unwrap()) - .with_crate(&crate_dir) + .with_crate(crate_dir) .with_no_includes() - .generate() - .expect("error: Unable to generate bindings for vortex.h") - .write_to_file(&generated_header); - - // Run clang-format on the generated header. - if let Ok(status) = Command::new("clang-format") - .arg("-i") - .arg("--style=file") - .arg(&generated_header) - .status() - { + .generate(); + match output { + Ok(bindings) => bindings.write_to_file(&header), + Err(e) => { + println!("cargo:error=Failed to generate cbindgen bindings for vortex.h: {e}"); + exit(1); + } + }; + + let mut cmd = Command::new("clang-format"); + let format = cmd.arg("-i").arg("--style=file").arg(&header); + if let Ok(status) = format.status() { if !status.success() { println!("cargo:warning=clang-format exited with status {status}"); } } else { println!("cargo:warning=clang-format not found, skipping formatting of generated header"); } +} - // Watch C/C++ source files for changes. - for entry in walkdir::WalkDir::new("cpp/").into_iter().flatten() { - if entry - .path() - .extension() - .is_some_and(|ext| ext == "cpp" || ext == "h" || ext == "hpp") - { - println!("cargo:rerun-if-changed={}", entry.path().display()); - } +fn main() { + println!("cargo:rerun-if-env-changed=DUCKDB_VERSION"); + println!("cargo:rerun-if-env-changed=VX_DUCKDB_DEBUG"); + println!("cargo:rerun-if-env-changed=VX_DUCKDB_SAN"); + println!("cargo:rerun-if-env-changed=CARGO_HTTP_TIMEOUT"); + println!("cargo:rerun-if-env-changed=HTTP_TIMEOUT"); + println!("cargo:rerun-if-env-changed=TARGET"); + + // `DUCKDB_VERSION` is set by the extension build in CI. 
+ // This is to ensure we don't implicitly build against a different DuckDB + // version during an extension build which might lead to subtle ABI breaks, + // e.g. reordering fields in C++ structs. + let version = env::var("DUCKDB_VERSION") + // You can also change this version to a commit hash + .unwrap_or_else(|_| "1.5.0".to_owned()); + let version = DuckDBVersion::from(&version); + match &version { + DuckDBVersion::Release(v) => println!("cargo:info=Using DuckDB release version: {v}"), + DuckDBVersion::Commit(c) => println!("cargo:info=Using DuckDB commit: {c}"), } + + let crate_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap()); + let duckdb_dir = crate_dir.join("duckdb"); + let target_dir = crate_dir.parent().unwrap().join("target"); + let library_dir = target_dir.join(format!("duckdb-lib-{version}")); + + let library_dir_str = library_dir.display(); + println!("cargo:rustc-link-search=native={library_dir_str}"); + println!("cargo:rustc-link-lib=dylib=duckdb"); + + // Set rpath for binaries built directly from this crate. This is not + // inherited by downstream crates. + println!("cargo:rustc-link-arg=-Wl,-rpath,{library_dir_str}"); + + // Export the library path for downstream crates via the `links` manifest key. + // Downstream crates can access this via `env::var("DEP_DUCKDB_LIB_DIR")` in their build.rs + // and add their own rpath: + // + // if let Ok(duckdb_lib) = env::var("DEP_DUCKDB_LIB_DIR") { + // println!("cargo:rustc-link-arg=-Wl,-rpath,{duckdb_lib}"); + // } + // + // Alternatively, set LD_LIBRARY_PATH (Linux) or DYLD_LIBRARY_PATH (macOS) at runtime: + // LD_LIBRARY_PATH=/path/to/target/duckdb-lib-vX.Y.Z cargo run --bin ... 
+ println!("cargo:lib_dir={library_dir_str}"); + + let source_dir = target_dir.join(format!("duckdb-source-{version}")); + let source_archive_url = match &version { + DuckDBVersion::Release(v) => format!("{DUCKDB_SOURCE_RELEASE_URL}/v{v}.zip"), + DuckDBVersion::Commit(c) => format!("{DUCKDB_SOURCE_COMMIT_URL}/{c}.zip"), + }; + + fs::create_dir_all(&source_dir).unwrap(); + let source_archive_path = source_dir.with_extension("zip"); + download_url(&source_archive_url, &source_archive_path); + + let inner_dir = source_dir.join(version.archive_inner_dir_name()); + if !inner_dir.join("CMakeLists.txt").exists() { + extract(&source_archive_path, &source_dir); + } + + drop(fs::remove_file(&duckdb_dir)); + drop(fs::remove_dir_all(&duckdb_dir)); + std::os::unix::fs::symlink(&source_dir, &duckdb_dir).unwrap(); + + let has_debug_env = + env::var("VX_DUCKDB_DEBUG").is_ok_and(|v| matches!(v.as_str(), "1" | "true")); + let build_type = match has_debug_env { + true => "debug", + false => "release", + }; + println!("cargo:info=building DuckDB in {build_type} mode"); + + if has_debug_env || matches!(version, DuckDBVersion::Commit(_)) { + try_build_duckdb(&source_dir, &library_dir, &version, build_type); + } else { + download(&version, &library_dir); + }; + + let duckdb_include_dir = inner_dir.join("src").join("include"); + c2rust(&crate_dir, &duckdb_include_dir); + cpp(&duckdb_include_dir); + rust2c(&crate_dir); } diff --git a/vortex-duckdb/src/convert/expr.rs b/vortex-duckdb/src/convert/expr.rs index c2d5a37813d..80d92ede449 100644 --- a/vortex-duckdb/src/convert/expr.rs +++ b/vortex-duckdb/src/convert/expr.rs @@ -3,7 +3,6 @@ use std::sync::Arc; -use itertools::Itertools; use tracing::debug; use vortex::dtype::Nullability; use vortex::error::VortexError; @@ -97,7 +96,7 @@ pub fn try_from_bound_expression( DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_NOT | DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_IS_NULL | DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_OPERATOR_IS_NOT_NULL 
=> { - let children = operator.children().collect_vec(); + let children: Vec<_> = operator.children().collect(); assert_eq!(children.len(), 1); let Some(child) = try_from_bound_expression(children[0])? else { return Ok(None); @@ -113,7 +112,7 @@ pub fn try_from_bound_expression( } DUCKDB_VX_EXPR_TYPE::DUCKDB_VX_EXPR_TYPE_COMPARE_IN => { // First child is element, rest form the list. - let children = operator.children().collect_vec(); + let children: Vec<_> = operator.children().collect(); assert!(children.len() >= 2); let Some(element) = try_from_bound_expression(children[0])? else { return Ok(None); @@ -153,7 +152,7 @@ pub fn try_from_bound_expression( }, duckdb::ExpressionClass::BoundFunction(func) => match func.scalar_function.name() { DUCKDB_FUNCTION_NAME_CONTAINS => { - let children = func.children().collect_vec(); + let children: Vec<_> = func.children().collect(); assert_eq!(children.len(), 2); let Some(value) = try_from_bound_expression(children[0])? else { return Ok(None); diff --git a/vortex-duckdb/src/datasource.rs b/vortex-duckdb/src/datasource.rs index 2d896e02787..9e4d39c677d 100644 --- a/vortex-duckdb/src/datasource.rs +++ b/vortex-duckdb/src/datasource.rs @@ -130,7 +130,7 @@ impl Debug for DataSourceBindData { .filter_exprs .iter() .map(|e| e.to_string()) - .collect_vec(), + .collect::<Vec<_>>(), ) .finish() } @@ -523,7 +523,7 @@ fn extract_table_filter_expr( }; table_filter_exprs.extend(additional_filters.iter().cloned()); - Ok(and_collect(table_filter_exprs.into_iter().collect_vec())) + Ok(and_collect(table_filter_exprs)) } #[cfg(test)] diff --git a/vortex-duckdb/src/duckdb/copy_function/callback.rs b/vortex-duckdb/src/duckdb/copy_function/callback.rs index 9b6360ae792..0bc303fe1cb 100644 --- a/vortex-duckdb/src/duckdb/copy_function/callback.rs +++ b/vortex-duckdb/src/duckdb/copy_function/callback.rs @@ -6,7 +6,6 @@ use std::os::raw::c_char; use std::os::raw::c_ulong; use std::os::raw::c_void; -use itertools::Itertools; use num_traits::AsPrimitive; use
vortex::error::VortexExpect; @@ -39,12 +38,12 @@ pub(crate) unsafe extern "C-unwind" fn bind_callback( .to_string_lossy() .into_owned() }) - .collect_vec(); + .collect(); let column_types = unsafe { std::slice::from_raw_parts(column_types, column_type_count.as_()) } .iter() .map(|c| unsafe { LogicalType::borrow(*c) }) - .collect_vec(); + .collect(); try_or_null(error_out, || { let bind_data = T::bind(column_names, column_types)?; diff --git a/vortex-duckdb/src/exporter/bool.rs b/vortex-duckdb/src/exporter/bool.rs index 39a4a456822..75dfa195bc9 100644 --- a/vortex-duckdb/src/exporter/bool.rs +++ b/vortex-duckdb/src/exporter/bool.rs @@ -1,7 +1,6 @@ // SPDX-License-Identifier: Apache-2.0 // SPDX-FileCopyrightText: Copyright the Vortex contributors -use itertools::Itertools; use vortex::array::ExecutionCtx; use vortex::array::arrays::BoolArray; use vortex::buffer::BitBuffer; @@ -51,7 +50,7 @@ impl ColumnExporter for BoolExporter { .bit_buffer .slice(offset..(offset + len)) .iter() - .collect_vec(), + .collect::<Vec<_>>(), ); Ok(()) @@ -109,7 +108,7 @@ mod tests { r#"Chunk - [1 Columns] - FLAT BOOLEAN: 65 = [ {}] "#, - iter::repeat_n("true", 65).join(", ") + iter::repeat_n("true", 65).collect::<Vec<_>>().join(", ") ) ); } diff --git a/vortex-duckdb/src/exporter/primitive.rs b/vortex-duckdb/src/exporter/primitive.rs index 4ea2a4c92bf..5e4a3f7e121 100644 --- a/vortex-duckdb/src/exporter/primitive.rs +++ b/vortex-duckdb/src/exporter/primitive.rs @@ -72,7 +72,6 @@ impl ColumnExporter for PrimitiveExporter { #[cfg(test)] mod tests { - use itertools::Itertools; use vortex_array::VortexSessionExecute; use super::*; @@ -112,9 +111,9 @@ mod tests { let arr = PrimitiveArray::from_iter(0..len as i32); { - let mut chunk = (0..ARRAY_COUNT) + let mut chunk: Vec<_> = (0..ARRAY_COUNT) .map(|_| DataChunk::new([LogicalType::new(cpp::duckdb_type::DUCKDB_TYPE_INTEGER)])) - .collect_vec(); + .collect(); for i in 0..ARRAY_COUNT { let mut ctx = SESSION.create_execution_ctx(); @@ -137,6 +136,7 @@ mod tests {
"#, &(i * vector_size..(i + 1) * vector_size) .map(|i| i.to_string()) + .collect::>() .join(", ") ) ); diff --git a/vortex-duckdb/src/exporter/varbinview.rs b/vortex-duckdb/src/exporter/varbinview.rs index a48f42b41eb..13a59d242ef 100644 --- a/vortex-duckdb/src/exporter/varbinview.rs +++ b/vortex-duckdb/src/exporter/varbinview.rs @@ -4,7 +4,6 @@ use std::ffi::c_char; use std::sync::Arc; -use itertools::Itertools; use vortex::array::ExecutionCtx; use vortex::array::arrays::VarBinViewArray; use vortex::array::arrays::varbinview::BinaryView; @@ -45,19 +44,14 @@ pub(crate) fn new_exporter( return Ok(all_invalid::new_exporter(len, <ype)); } - let buffers = buffers - .iter() - .cloned() - .map(|b| b.unwrap_host()) - .collect_vec(); - + let buffers: Vec<_> = buffers.iter().cloned().map(|b| b.unwrap_host()).collect(); let buffers: Arc<[ByteBuffer]> = Arc::from(buffers); Ok(validity::new_exporter( validity, Box::new(VarBinViewExporter { views: Buffer::::from_byte_buffer(views.unwrap_host()), - vector_buffers: buffers.iter().cloned().map(VectorBuffer::new).collect_vec(), + vector_buffers: buffers.iter().cloned().map(VectorBuffer::new).collect(), buffers, }), ))